# Anton Sarukhanov authored c2893bea
# scrape.py 3.56 KiB
from lxml import html
import requests
import re
from urllib.parse import urlparse, urljoin
from itertools import repeat
def _read_event_roster(event_tree):
    """Return ``(status, fencers)`` for a parsed event page, or ``None``.

    The page is classified by which navigation link it contains:

    * "Final Results"   -> status ``"closed"``
    * "Seeding"         -> status ``"ongoing"``
    * "Check-In Status" -> status ``"open"``

    ``fencers`` maps each raw (unstripped) name taken from the second table
    column to a check-in flag.  For closed and ongoing events the flag is the
    truthy string ``"Checked In"``; for open events it is a bool that is True
    when the first-column cell of the row has child elements (presumably the
    check-in marker -- confirm against the live-results markup).

    ``None`` is returned when none of the three links is present.
    """
    if event_tree.xpath('//a[text()="Final Results"]'):
        names = event_tree.xpath(
            '//div[@id="finalResults"]/table/tr/td[2]/text()')
        return "closed", dict.fromkeys(names, "Checked In")
    if event_tree.xpath('//a[text()="Seeding"]'):
        names = event_tree.xpath(
            '//div[@id="Round1Seeding"]/table/tr/td[2]/text()')
        return "ongoing", dict.fromkeys(names, "Checked In")
    if event_tree.xpath('//a[text()="Check-In Status"]'):
        checked_in = [
            len(cell) > 0
            for cell in event_tree.xpath(
                '//div[@id="checkIn"]/table/tr/td[1]')
        ]
        names = event_tree.xpath('//div[@id="checkIn"]/table/tr/td[2]/text()')
        return "open", dict(zip(names, checked_in))
    return None


def scrape(results_url):
    """Scrape a live-results page and summarise every event linked from it.

    Args:
        results_url: URL of the tournament results page.  If it has no
            scheme, the request is retried with ``http://`` prepended.

    Returns:
        A tuple ``(tournament_name, tournament_details, events)``.  Each
        entry of ``events`` is a dict with the keys ``name``, ``time``,
        ``status``, ``fencers``, ``fencers_checked_in``,
        ``new_fencers_not_checked_in``, ``previously_fenced`` (earlier event
        name -> number of this event's fencers first found there) and
        ``previous_total``.  The string ``"No event schedule found"`` is
        returned if looking up the schedule links raises ``IndexError``.

    Raises:
        IndexError: if the page has no ``tournName`` or ``tournDetails`` span.
        Exception: if an event page lacks its name/time details or shows none
            of the "Final Results", "Seeding" or "Check-In Status" links.
    """
    try:
        results = requests.get(results_url)
    except requests.exceptions.MissingSchema:
        results = requests.get("http://{}".format(results_url))
    results_tree = html.fromstring(results.content)
    try:
        event_urls = results_tree.xpath(
            '//div[@id="schedule"]/table/tr/td/a[text()="View"]/@href')
    except IndexError:
        # NOTE(review): xpath() returns an empty list when nothing matches, so
        # this handler probably never fires; kept so the return contract that
        # callers may rely on is unchanged.
        return "No event schedule found"
    tournament_name = results_tree.xpath(
        '//span[@class="tournName"]/text()')[0]
    tournament_details = results_tree.xpath(
        '//span[@class="tournDetails"]/text()')[0]

    events = []
    for event_url in event_urls:
        # Schedule links may be relative; resolve them against the final
        # (post-redirect) URL of the results page.
        if not urlparse(event_url).netloc:
            event_url = urljoin(results.url, event_url)
        event = requests.get(event_url)
        event_tree = html.fromstring(event.content)
        event_details = event_tree.xpath(
            '//span[@class="tournDetails"]/text()')
        try:
            event_name = event_details[0]
            event_time = event_details[1]
        except IndexError as err:
            raise Exception(
                "Failed to interpret live results for event \"{}\".".format(
                    event_url)) from err

        roster = _read_event_roster(event_tree)
        if roster is None:
            # Without this guard the status and fencer list of the previous
            # event would be silently reused (or a NameError raised on the
            # first event).
            raise Exception(
                "Failed to interpret live results for event \"{}\".".format(
                    event_url))
        event_status, fencers = roster

        this_event = {
            'name': event_name,
            'time': event_time,
            'status': event_status,
            'fencers': [],
            'fencers_checked_in': [],
            'new_fencers_not_checked_in': [],
            'previously_fenced': {},
            'previous_total': 0
        }
        previously_fenced = this_event['previously_fenced']
        for fencer, is_checked_in in fencers.items():
            fencer = fencer.strip()
            this_event['fencers'].append(fencer)
            if is_checked_in:
                this_event['fencers_checked_in'].append(fencer)
            else:
                this_event['new_fencers_not_checked_in'].append(fencer)
            for e in events:
                # Skip earlier entries that carry this event's own name.
                if e['name'] == event_name:
                    continue
                if fencer not in e['fencers']:
                    continue
                previously_fenced[e['name']] = (
                    previously_fenced.get(e['name'], 0) + 1)
                this_event['previous_total'] += 1
                # A fencer seen in an earlier event is not "new", even when
                # not checked in here.
                try:
                    this_event['new_fencers_not_checked_in'].remove(fencer)
                except ValueError:
                    pass  # was checked in, so never listed as new
                # Count each fencer once, against the first earlier event.
                break
        events.append(this_event)
    return (tournament_name, tournament_details, events)