From c1541bda6ad266e44a3e8d9891f023bb0aaeeab0 Mon Sep 17 00:00:00 2001 From: Anton Sarukhanov <code@ant.sr> Date: Tue, 3 Dec 2019 22:20:09 -0500 Subject: [PATCH] Add support for Fencing Time Live. --- app.py | 39 +++++++----- models.py | 9 ++- scraper.py | 141 ++++++++++++++++++++++++++++++++++++++----- templates/index.html | 64 +++++++++----------- templates/live.html | 4 +- 5 files changed, 187 insertions(+), 70 deletions(-) diff --git a/app.py b/app.py index caa2155..bbea236 100644 --- a/app.py +++ b/app.py @@ -2,10 +2,10 @@ from flask import Flask, render_template, redirect, request, url_for from flask_caching import Cache -from scraper import FTPScraper +from scraper import FTPScraper, FTLiveScraper import models -# pylint: disable=invalid-name +# pylint: disable=invalid-name ; These module-level variables are standard for Flask. app = Flask(__name__) cache = Cache(app, config={'CACHE_TYPE': 'simple'}) @@ -14,6 +14,13 @@ DISPLAY_DATE_FORMAT = '%A, %B %-d, %Y' DISPLAY_TIME_FORMAT = '%-I:%M %p on %A' +def _make_cache_key(): + """Create a cache key for Flask-Caching.""" + path = request.path + args = str(hash(frozenset(request.args.items()))) + return (path + args).encode('utf-8') + + @app.after_request def add_header(response): """Add an HTTP response header for cache invalidation.""" @@ -23,36 +30,36 @@ def add_header(response): @app.template_filter('strftime') def _jinja2_filter_datetime(datetime, date=True, time=True): + """Format a DateTime for display to a user.""" return datetime.strftime(DISPLAY_DATETIME_FORMAT if date and time else DISPLAY_DATE_FORMAT if date else DISPLAY_TIME_FORMAT if time else '') @app.route("/") +@cache.cached(timeout=300) def index(): """Render the app landing page.""" - return render_template('index.html') - - -def _make_cache_key(): - """Create a cache key for Flask-Caching.""" - path = request.path - args = str(hash(frozenset(request.args.items()))) - return (path + args).encode('utf-8') + ftl_scraper = FTLiveScraper() + return render_template('index.html', + tournaments=ftl_scraper.list_tournaments()) @app.route("/live") @cache.cached(timeout=300, key_prefix=_make_cache_key) -def live(results_url=None): +def live(): """Render the primary view of live tournament stats.""" results_url = request.args.get('results_url') - if not results_url: + ftl_id = request.args.get('ftl_id') + + if ftl_id: + tournament = FTLiveScraper().scrape_tournament(tournament_id=ftl_id) + elif results_url: + tournament = FTPScraper(results_url).scrape_tournament() + else: return redirect(url_for('index')) - scraper = FTPScraper(results_url) - tournament = scraper.scrape() - return render_template('live.html', tournament=tournament, events=tournament.events, - phases=models.EventPhase) + return render_template('live.html', tournament=tournament, phases=models.EventPhase) if __name__ == "__main__": diff --git a/models.py b/models.py index cb7a4a2..3dbf3a1 100644 --- a/models.py +++ b/models.py @@ -6,7 +6,7 @@ from datetime import datetime from typing import List from enum import Enum -Fencer = namedtuple('Fencer', 'name is_checked_in') +Fencer = namedtuple('Fencer', 'name is_checked_in ftl_id', defaults=[None]) setattr(Fencer, '__eq__', lambda f1, f2: f1.name == f2.name) @@ -16,7 +16,9 @@ class Tournament: name: str url: str - updated: datetime + ftl_id: str = None # type: ignore + updated: datetime = None # type: ignore + location: str = None # type: ignore events: List['Event'] = field(default_factory=list) def count_fencers(self): @@ -47,7 +49,7 @@ class EventStatistics: @dataclass -class Event: +class Event: # pylint: disable=too-many-instance-attributes """A single event in a tournament (e.g. Y12 Mens Foil).""" name: str @@ -55,6 +57,7 @@ class Event: time: datetime tournament: Tournament phase: EventPhase + ftl_id: str = None # type: ignore stats: EventStatistics = field(default_factory=EventStatistics) fencers: List[Fencer] = field(default_factory=list) diff --git a/scraper.py b/scraper.py index 05a4475..37f9e59 100644 --- a/scraper.py +++ b/scraper.py @@ -2,14 +2,24 @@ import asyncio from concurrent.futures import ThreadPoolExecutor -from datetime import datetime -from urllib.parse import urlparse, urljoin -from lxml import html # nosec Bandit suggests defusedxml but defusedxml.lxml is dead +from datetime import date, datetime, timedelta +from urllib.parse import urlparse, urljoin, urlencode +from lxml import html # nosec ; Bandit suggests defusedxml but defusedxml.lxml is dead import requests from models import Event, EventPhase, Fencer, Tournament +# pylint: disable=too-few-public-methods ; I'm ok with that. -class FTPScraper: + +class Scraper: + """Base class.""" + + def __init__(self): + """Initialize common args for scrapers.""" + self.tournament = None + + +class FTPScraper(Scraper): """Scraper for tournaments hosted on an FTP server. This reads the original Fencing Time results pages, hosted by individual @@ -22,18 +32,20 @@ class FTPScraper: def __init__(self, tournament_url): """Set up the scraper instance.""" self.tournament_url = tournament_url - self.tournament = None + super(FTPScraper, self).__init__() - def scrape(self): + def scrape_tournament(self): """Get all tournament information.""" try: results = requests.get(self.tournament_url) except requests.exceptions.MissingSchema: results = requests.get("http://{}".format(self.tournament_url)) - results_tree = html.fromstring(results.content) + tournament_etree = html.fromstring(results.content) try: - tournament_name = results_tree.xpath('//span[@class="tournName"]/text()')[0] - updated_str = (results_tree.xpath('//span[@class="lastUpdate"]/text()')[0] + tournament_name = tournament_etree.xpath( + '//span[@class="tournName"]/text()')[0] + updated_str = (tournament_etree.xpath( + '//span[@class="lastUpdate"]/text()')[0] .replace('Last Updated:', '').strip()) updated = datetime.strptime(updated_str, self.UPDATED_DATETIME_FORMAT) except IndexError: @@ -44,7 +56,7 @@ class FTPScraper: # Get tournament events try: - event_urls = results_tree.xpath( + event_urls = tournament_etree.xpath( '//div[@id="schedule"]/table/tr/td/a[text()="View"]/@href') except IndexError: raise ScrapeError("No event schedule found.") @@ -66,12 +78,12 @@ class FTPScraper: futures.append(loop.run_in_executor(executor, requests.get, event_url)) for response in await asyncio.gather(*futures): - event = self.parse_event(response) + event = self._parse_event(response) self.tournament.events.append(event) self.tournament.count_fencers() - def parse_event(self, event): + def _parse_event(self, event): """Extract useful strings from the event info.""" event_tree = html.fromstring(event.content) event_details = event_tree.xpath('//span[@class="tournDetails"]/text()') @@ -106,12 +118,113 @@ class FTPScraper: url=event.url, fencers=fencers, tournament=self.tournament) -class FTLiveScraper(FTPScraper): +class FTLiveScraper(Scraper): """Scraper for tournaments hosted on fencingtimelive.com. This reads the newer-style pages, centrally hosted by Fencing Time. """ - # to do... + + BASE_URL = 'https://fencingtimelive.com' + TOURNAMENTS_URL = urljoin(BASE_URL, 'tournaments/list/data?{query}') + TOURNAMENT_URL = urljoin(BASE_URL, 'tournaments/eventSchedule/{tournament_id}') + FENCERS_URL = urljoin(BASE_URL, 'events/competitors/data/{event_id}') + EVENT_URL = urljoin(BASE_URL, 'events/view/{event_id}') + START_FORMAT = '%Y-%m-%dT%H:%M:%S.000Z' + EVENT_DATETIME_FORMAT = '%A %B %d, %Y %I:%M %p' + MAX_AGO = timedelta(days=7) + MAX_AHEAD = timedelta(days=7) + + def list_tournaments(self, search=None, from_date=None, to_date=None): + """Get a list of tournaments in FTLive.""" + if not search and not from_date and not to_date: + from_date = date.today() - self.MAX_AGO + to_date = date.today() + self.MAX_AHEAD + args = { + 'tname': search or '', + 'from': from_date or '', + 'to': to_date or '' + } + url = self.TOURNAMENTS_URL.format(query=urlencode(args)) + tournaments = requests.get(url).json() + return [{'start': datetime.strptime(t['start'], self.START_FORMAT), + 'id': t['id'], + 'name': t['name'], + 'location': t['location']} + for t in tournaments] + + def scrape_tournament(self, tournament_id): + """Get all tournament information.""" + tournament_url = self.TOURNAMENT_URL.format(tournament_id=tournament_id) + tournament_html = requests.get(tournament_url).content + tournament_etree = html.fromstring(tournament_html) + try: + tournament_name = tournament_etree.xpath( + '//div[@class="desktop tournName"]/text()')[0] + except IndexError: + raise ScrapeError("Tournament info not found.") + self.tournament = Tournament(name=tournament_name, url=tournament_url, + ftl_id=tournament_id) + + event_data = tournament_etree.xpath( + "//tr[re:test(@id, 'ev_.*')]", + namespaces={"re": "http://exslt.org/regular-expressions"}) + + for event in event_data: + self.tournament.events.append(self._parse_event(event)) + + loop = asyncio.new_event_loop() + loop.run_until_complete(self._get_fencers()) + + return self.tournament + + async def _get_fencers(self): + """Get event information asynchronously.""" + def get_fencers(event): + fencers_url = self.FENCERS_URL.format(event_id=event.ftl_id) + response = requests.get(fencers_url).json() + event.fencers = [ + Fencer(name=f['name'], ftl_id=f['id'], + is_checked_in=(f['status'] == 'CheckedIn')) + for f in response if f['status'] != 'Scratched'] + return event + + with ThreadPoolExecutor(max_workers=20) as executor: + loop = asyncio.get_event_loop() + futures = [] + + for event in self.tournament.events: + futures.append(loop.run_in_executor(executor, get_fencers, event)) + + await asyncio.gather(*futures) + + self.tournament.count_fencers() + + def _parse_event(self, event_etree): + """Extract useful strings from the event info.""" + try: + name = event_etree.getchildren()[1].text_content().strip() + ftl_id = event_etree.attrib['data-href'].split('/')[-1] + url = self.EVENT_URL.format(event_id=ftl_id) + time_str = event_etree.getchildren()[0].text_content().strip() + table = next(event_etree.iterancestors('table')) + date_str = next(table.itersiblings('h5', preceding=True)).text_content() + dt_str = '{} {}'.format(date_str, time_str) + time = datetime.strptime(dt_str, self.EVENT_DATETIME_FORMAT) + return Event(name=name, url=url, time=time, tournament=self.tournament, + phase=self._get_event_phase(event_etree), ftl_id=ftl_id) + except (IndexError, ScrapeError) as exc: + raise ScrapeError("Failed to interpret live results for event \"{}\". {}" + .format(name, exc)) + + def _get_event_phase(self, event_etree): # pylint: disable=no-self-use + """Determine the state of the event.""" + if event_etree.xpath('td[3]/text()[contains(., "Finished")]'): + event_phase = EventPhase.FINISHED + elif event_etree.xpath('td[3]/text()[contains(., "Fencing")]'): + event_phase = EventPhase.STARTED + else: + event_phase = EventPhase.REGISTRATION + return event_phase class ScrapeError(Exception): diff --git a/templates/index.html b/templates/index.html index 4853d48..8561459 100644 --- a/templates/index.html +++ b/templates/index.html @@ -7,40 +7,34 @@ <p> Welcome! Please select a Live Results URL, or enter your own. </p> - <p> - <form action="{{ url_for('live') }}" method="get"> - <label>Live Results URL: - <select name="results_url" onchange="this.form.submit()"> - <option value="" selected disabled>-- Select One --</option> - <optgroup label="Escrime Management"> - <option value="http://escrimeresults.com/tournaments/NCAA.html">NCAA</option> - <option value="http://escrimeresults.com/tournaments/Atlantic-Coast-Conference.html">Atlantic Coast Conference</option> - <option value="http://escrimeresults.com/tournaments/Ivy-League.html">Ivy League</option> - <option value="http://escrimeresults.com/cobra/">Cobra</option> - <option value="http://escrimeresults.com/thrust/">Thrust</option> - <option value="http://escrimeresults.com/tournaments/NJSIAA.html">NJSIAA</option> - <option value="http://escrimeresults.com/Candlewood/">Candlewood</option> - <option value="http://escrimeresults.com/tournaments/U-Penn.html">U Penn</option> - <option value="http://escrimeresults.com/tournaments/Temple-University.html">Temple U</option> - <option value="http://escrimeresults.com/Big1/">Big1</option> - <option value="http://escrimeresults.com/tournaments/US-Collegiate-Squad-Championships.html">US Collegiate Squad</option> - <option value="http://escrimeresults.com/MKHS/">MKHS</option> - </optgroup> - <optgroup label="NJ Division"> - <option value="http://njfencingresults.org/liveresults/">NJFencingResults.org/liveresults</option> - </optgroup> - </select> - </label> - <input type="submit" value="Go!"> - </form> - </p> - <p> - <form action="{{ url_for('live') }}" method="get"> - <label>Other URL: - <input name="results_url" placeholder="example.com/liveresults"> - </label> - <input type="submit" value="Go!"> - </form> - </p> + <section> + <h2>Fencing Time Live</h2> + <p> + <form action="{{ url_for('live') }}" method="get"> + <label>Choose a Tournament: + <select name="ftl_id" onchange="this.form.submit()"> + <option value="" selected disabled>-- Select One --</option> + {% for tournament in tournaments | sort(attribute='start') %} + {% if loop.changed(tournament.start) %}</optgroup><optgroup label="{{ tournament.start | strftime(time=False) }}">{% endif %} + <option value="{{ tournament['id'] }}">{{ tournament['name'] }} ({{ tournament['location'] }})</option> + {% if loop.last %}</optgroup>{% endif %} + {% endfor %} + </select> + </label> + <input type="submit" value="Go!"> + </form> + </p> + </section> + <section> + <h2>FTP Live Results</h2> + <p> + <form action="{{ url_for('live') }}" method="get"> + <label>Enter a custom Live Results link: + <input name="results_url" placeholder="example.com/liveresults"> + </label> + <input type="submit" value="Go!"> + </form> + </p> + </section> </main> {% endblock content %} diff --git a/templates/live.html b/templates/live.html index ae8a4cf..b55e483 100644 --- a/templates/live.html +++ b/templates/live.html @@ -5,12 +5,12 @@ {% block content %} <header> <span class="back-to-home"><a href="{{ url_for('index') }}">Back to Home</a></span> - <h1>{{ tournament.name }} - {{ events | length }} events + <h1>{{ tournament.name }} - {{ tournament.events | length }} events <a href="{{ tournament.url }}" target="_blank"> <img class="ext-link" src="{{ url_for('static', filename='images/font-awesome/external-link-alt.svg') }}?t=20180415"></a></h1> </header> <main> - {% for e in events | sort(attribute='time') %} + {% for e in tournament.events | sort(attribute='time') %} {% if loop.changed(e.time.date()) %} <header><h2>{{ e.time | strftime(time=False) }}</h2></header> {% endif %} -- GitLab