Commit 310f4eeb authored by Anton Sarukhanov

Use asyncio instead of wasting time.

parent fed3abad
 from lxml import html
 import requests
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import urlparse, urljoin
 from models import Event, Fencer, Tournament
@@ -30,26 +33,33 @@ class Scraper:
                 '//div[@id="schedule"]/table/tr/td/a[text()="View"]/@href')
         except IndexError:
             raise ScrapeError("No event schedule found.")
-        self.scrape_events(event_urls)
+        loop = asyncio.new_event_loop()
+        loop.run_until_complete(self.scrape_events(event_urls))
         return self.tournament
 
-    def scrape_events(self, event_urls):
-        for event_url in event_urls:
-            # Build full event URL (scraped URLs are relative)
-            # TODO: Is there a cleaner (less "DIY") way to do this?
-            if not urlparse(event_url).netloc:
-                event_url = urljoin(self.tournament.url, event_url)
-            event = self.scrape_event(event_url)
-            self.tournament.add_event(event)
-        self.tournament.count_all_fencers()
+    async def scrape_events(self, event_urls):
+        with ThreadPoolExecutor(max_workers=20) as executor:
+            loop = asyncio.get_event_loop()
+            futures = []
+            for event_url in event_urls:
+                if not urlparse(event_url).netloc:
+                    event_url = urljoin(self.tournament.url, event_url)
+                futures.append(loop.run_in_executor(
+                    executor,
+                    requests.get,
+                    event_url))
+            for response in await asyncio.gather(*futures):
+                event = self.parse_event(response)
+                self.tournament.add_event(event)
+        self.tournament.count_all_fencers()
 
-    def scrape_event(self, event_url):
-        # Request event page
-        event = requests.get(event_url)
+    def parse_event(self, event):
         # Get the event details (name, time) as text
         event_tree = html.fromstring(event.content)
         event_details = event_tree.xpath(
@@ -60,7 +70,7 @@ class Scraper:
         except IndexError:
             raise ScrapeError(
                 "Failed to interpret live results for event \"{}\"."
-                .format(event_url))
+                .format(event_details))
         # Get the event status
         if event_tree.xpath('//a[text()="Final Results"]'):
...
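
The change fans the blocking requests.get calls out to a thread pool via loop.run_in_executor and awaits them all with asyncio.gather, so event pages download concurrently instead of one at a time. Below is a minimal self-contained sketch of that pattern under the same assumptions; fetch_all, the placeholder URLs, and the pool size of 20 are illustrative, not part of this project.

import asyncio
from concurrent.futures import ThreadPoolExecutor

import requests


async def fetch_all(urls, max_workers=20):
    # requests.get blocks, so each call runs on a worker thread while
    # the event loop waits on all the resulting futures at once.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        loop = asyncio.get_event_loop()
        futures = [loop.run_in_executor(executor, requests.get, url)
                   for url in urls]
        return await asyncio.gather(*futures)


# Placeholder URLs for illustration only.
urls = ["https://example.com/a", "https://example.com/b"]
loop = asyncio.new_event_loop()
responses = loop.run_until_complete(fetch_all(urls))
print([r.status_code for r in responses])

On Python 3.7+, asyncio.run(fetch_all(urls)) replaces the new_event_loop/run_until_complete pair, and asyncio.get_running_loop() is the preferred spelling inside a coroutine; the commit's older spelling behaves the same way here.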