Skip to content

Commit

Permalink
scrape association details from new portal urls
Browse files Browse the repository at this point in the history
relates to #77
removes statically coded association abbreviations
subordinates district parsing to association parsing.
previously all associations would be scraped before the first district.
  • Loading branch information
djbrown authored Aug 3, 2023
1 parent 400c27f commit 013af77
Show file tree
Hide file tree
Showing 6 changed files with 3,171 additions and 57 deletions.
22 changes: 0 additions & 22 deletions src/associations/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,3 @@ def build_source_url(bhv_id):

def source_url(self):
    """Return the source URL for this object.

    Delegates to ``build_source_url``, passing this instance's ``bhv_id``.
    """
    build = self.build_source_url
    return build(self.bhv_id)

@staticmethod
def get_association_abbreviation(association_name):
association_abbreviations = {
'Badischer Handball-Verband': 'BHV',
'Fédération Luxembourgeoise de Handball': 'FLH',
'Hamburger Handball-Verband': 'HHV',
'Handball Baden-Württemberg': 'HBW',
'Handball-Verband Saar': 'HVS',
'Handballoberliga Rheinland-Pfalz/Saar': 'RPS',
'Handballverband Rheinhessen': 'HVR',
'Handballverband Schleswig-Holstein': 'HVSH',
'Handballverband Westfalen': 'HVWF',
'Handballverband Württemberg': 'HVW',
'Oberliga Hamburg - Schleswig-Holstein': 'HHSH',
'Pfälzer Handballverband': 'PfHV',
'Südbadischer Handballverband': 'SHV',
'Vorarlberger Handballverband': 'VHV',
# 'Mitteldeutscher Handball-Verband': 'MHV',
# 'Thüringer Handball-Verband': 'THV',
}
return association_abbreviations[association_name]
25 changes: 12 additions & 13 deletions src/base/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,28 +15,27 @@ def html_dom(html_text: str) -> _Element:
return html.fromstring(html_text)


def parse_association_urls(dom: _Element, root_url: str) -> list[str]:
    """Collect association link targets from the main-content table.

    Takes the ``href`` of the link in the first cell of every table row
    matched by the XPath below. An href that does not start with
    ``'http'`` is prefixed with *root_url* by plain string concatenation;
    all others are returned unchanged.
    """
    hrefs = cast(list[str], dom.xpath('//div[@id="main-content"]//table[@summary]/tbody/tr/td[1]/a/@href'))
    urls = []
    for href in hrefs:
        if href.startswith('http'):
            urls.append(href)
        else:
            urls.append(root_url + href)
    return urls


def parse_association_bhv_id_from_dom(dom: _Element) -> int:
    """Read the association id from the ``data-og-id`` attribute of ``div#app``.

    Raises ``ValueError`` when the XPath does not match exactly one
    attribute, or when the attribute value is not an integer literal.
    """
    matches = cast(list[str], dom.xpath('//div[@id="app"]/@data-og-id'))
    # Tuple unpacking enforces that exactly one attribute was found.
    (raw_id,) = matches
    return int(raw_id)


def parse_link_query_item(link: _Element, query_key: str) -> str:
    """Return the first value of *query_key* in the query string of a link.

    The URL is taken from the ``href`` attribute of *link*. A key that is
    absent from the query string raises ``KeyError``.
    """
    url_parts = urlsplit(link.get('href'))
    params = parse_qs(cast(str, url_parts.query))
    # parse_qs maps every key to a list of values; only the first is used.
    return params[query_key][0]


def parse_association_bhv_id(link: _Element) -> int:
    """Return the integer ``orgGrpID`` query parameter of *link*'s href."""
    org_group_id = parse_link_query_item(link, 'orgGrpID')
    return int(org_group_id)
def parse_association_urls(dom: _Element) -> list[str]:
    """Return the hrefs of the links nested under the active main-navi entry.

    The values are returned exactly as they appear in the document; no
    base URL is applied.
    """
    # NOTE(review): XPath contains() is a substring test, so a class such as
    # "inactive" would also match — confirm against the real page markup.
    hrefs = dom.xpath('//ul[@id="main-navi"]/li[contains(@class, "active")]//li/a/@href')
    return cast(list[str], hrefs)


def parse_association_abbreviation(association_url: str) -> str:
    """Return the last path segment of *association_url*.

    Trailing slashes are ignored, so ``'.../portal/bhv'`` and
    ``'.../portal/bhv/'`` both yield ``'bhv'`` (previously the latter
    produced an empty string).

    Raises ``IndexError`` when the URL contains no ``'/'`` separator.
    """
    trimmed = association_url.rstrip('/')
    return trimmed.rsplit('/', 1)[1]


def parse_association_name(dom: _Element) -> str:
# Returns the first result of an XPath text query on `dom`; an empty result
# list raises IndexError because of the `[0]` subscript.
# NOTE(review): two return statements follow each other, so as written only
# the first one can execute. This looks like a rendered diff that shows the
# removed and the added line together — confirm which XPath is current.
return cast(list[str], dom.xpath('//*[@id="results"]/div/h1/text()[2]'))[0]
return cast(list[str], dom.xpath('//h2/a/text()'))[0]


def parse_association_bhv_id(dom: _Element) -> int:
    """Return the integer stored in the ``data-og-id`` attribute of ``div#app``.

    Raises ``ValueError`` unless the XPath matches exactly one attribute
    whose value parses as an integer.
    """
    og_ids = cast(list[str], dom.xpath('//div[@id="app"]/@data-og-id'))
    # Single-target unpacking fails unless there is exactly one match.
    (only_id,) = og_ids
    return int(only_id)


def parse_district_items(dom: _Element) -> list[_Element]:
Expand Down
Loading

0 comments on commit 013af77

Please sign in to comment.