From 310f4eeb732340c39e0f5b334eb073cd0a958691 Mon Sep 17 00:00:00 2001
From: Anton Sarukhanov <code@ant.sr>
Date: Sun, 15 Apr 2018 17:16:12 -0400
Subject: [PATCH] Use asyncio instead of wasting time.

---
 scraper.py | 38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/scraper.py b/scraper.py
index a564b0c..025064e 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,6 +1,9 @@
 from lxml import html
 import requests
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import urlparse, urljoin
+
 from models import Event, Fencer, Tournament
 
 
@@ -30,26 +33,33 @@ class Scraper:
             '//div[@id="schedule"]/table/tr/td/a[text()="View"]/@href')
         except IndexError:
             raise ScrapeError("No event schedule found.")
 
-        self.scrape_events(event_urls)
+
+        loop = asyncio.new_event_loop()
+        loop.run_until_complete(self.scrape_events(event_urls))
         return self.tournament
 
-    def scrape_events(self, event_urls):
-        for event_url in event_urls:
+    async def scrape_events(self, event_urls):
+
+        with ThreadPoolExecutor(max_workers=20) as executor:
+            loop = asyncio.get_event_loop()
+            futures = []
 
-            # Build full event URL (scraped URLs are relative)
-            # TODO: Is there a cleaner (less "DIY") way to do this?
-            if not urlparse(event_url).netloc:
-                event_url = urljoin(self.tournament.url, event_url)
+            for event_url in event_urls:
+                if not urlparse(event_url).netloc:
+                    event_url = urljoin(self.tournament.url, event_url)
+                futures.append(loop.run_in_executor(
+                    executor,
+                    requests.get,
+                    event_url))
 
-            event = self.scrape_event(event_url)
-            self.tournament.add_event(event)
-            self.tournament.count_all_fencers()
+            for response in await asyncio.gather(*futures):
+                event = self.parse_event(response)
+                self.tournament.add_event(event)
 
-    def scrape_event(self, event_url):
-        # Request event page
-        event = requests.get(event_url)
+            self.tournament.count_all_fencers()
 
+    def parse_event(self, event):
         # Get the event details (name, time) as text
         event_tree = html.fromstring(event.content)
         event_details = event_tree.xpath(
@@ -60,7 +70,7 @@ class Scraper:
         except IndexError:
             raise ScrapeError(
                 "Failed to interpret live results for event \"{}\"."
-                .format(event_url))
+                .format(event_details))
 
         # Get the event status
         if event_tree.xpath('//a[text()="Final Results"]'):
-- 
GitLab
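
A note on the pattern this patch uses: asyncio by itself does not make the
blocking requests library non-blocking. The speedup comes from handing each
requests.get call to a ThreadPoolExecutor via loop.run_in_executor and
awaiting the whole batch with asyncio.gather, so the HTTP requests overlap
instead of running one after another. Below is a minimal standalone sketch of
the same pattern; the fetch_all name and the example.com URLs are
illustrative only, not part of this patch:

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    import requests

    async def fetch_all(urls):
        # Offload each blocking requests.get call to a worker thread,
        # then wait until every response has arrived.
        with ThreadPoolExecutor(max_workers=20) as executor:
            loop = asyncio.get_event_loop()
            futures = [loop.run_in_executor(executor, requests.get, url)
                       for url in urls]
            return await asyncio.gather(*futures)

    if __name__ == '__main__':
        # Hypothetical URLs, for illustration only.
        urls = ['https://example.com/a', 'https://example.com/b']
        loop = asyncio.new_event_loop()
        responses = loop.run_until_complete(fetch_all(urls))
        print([r.status_code for r in responses])

The thread pool is what lets the otherwise synchronous requests calls run
concurrently; a natively asynchronous HTTP client such as aiohttp would make
the executor unnecessary.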