#! /usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Scraper helpers for the calendar of judoverbandsachsen.de (JVS).

NOTE(review): this text was recovered from a whitespace-collapsed diff that
covered ``jvsCal.py`` and a scratch driver script.  A span in the middle was
missing (the tail of ``parseJvsCal``, the rest of ``jvsCal.py`` including
``getWk`` and ``wettkampf``, and the head of ``class event``).  The places
that had to be reconstructed are marked with ``NOTE(review)`` below and must
be checked against the real files.
"""

import datetime
import json
import re

jvsCalUrl = "https://judoverbandsachsen.de/kalender/?show=all"
jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"


def parseDateSpan(text, year):
    '''
    Turn a "DD.MM." or "DD.MM. - DD.MM." string into ISO date strings.

    text -- raw text of a <time> element; any whitespace runs are collapsed
    year -- year to put in front of every date (str or int)

    returns a list with one "YYYY-MM-DD" string per "-"-separated part.
    Raises IndexError if a part contains no "." separator.
    '''
    collapsed = re.sub(r"\s+", " ", text.strip())
    isoDates = []
    for part in collapsed.split("-"):
        fields = [f.strip() for f in part.split(".")]
        day, month = fields[0], fields[1]
        # date.fromisoformat() only accepts two-digit month and day
        isoDates.append("-".join([str(year), month.zfill(2), day.zfill(2)]))
    return isoDates


def fetchMonthSections(url=jvsCalUrl):
    '''
    Download the calendar page and return its <div class="month"> elements.

    Raises requests.HTTPError on a non-2xx answer and ValueError if the page
    has no element with id "eventListItems".
    '''
    # Imported here so the pure helpers and data classes of this module can
    # be imported without the scraping dependencies being installed.
    import requests
    from bs4 import BeautifulSoup

    jvsCalPage = requests.get(url, timeout=30)
    jvsCalPage.raise_for_status()
    jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
    jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
    if jvsCalEventListItems is None:
        raise ValueError(f"no element with id 'eventListItems' found at {url}")
    return jvsCalEventListItems.find_all("div", class_="month")


def parseJvsCal(url=jvsCalUrl, minYear=0, minMonth=0, onlyWithAks=False):
    '''
    Parse the calendar page of the jvs

    url         -- address of the calendar page
    minYear     -- skip all months of earlier years
    minMonth    -- within minYear, skip all earlier months
    onlyWithAks -- passed on to parseJvsEvent as onlyIfWithAk

    returns dictionary of dictionaries of list: cal[year][month] = list of
    event dictionaries (year and month are ints)
    '''
    jvsWkList = {}

    for m in fetchMonthSections(url):
        if not m.has_attr("data-month"):
            continue
        # data-month is read as "YYYYMM"
        yearMonth = m.get("data-month")
        year = int(yearMonth[0:4])
        if year < minYear:
            continue
        if year not in jvsWkList:
            jvsWkList[year] = {}
        month = int(yearMonth[4:6])
        if year == minYear and month < minMonth:
            continue
        # NOTE(review): the rest of this loop body and the return statement
        # were missing from the recovered text.  Reconstructed from the
        # driver code, which iterates class "posts" elements and reads
        # e['url'] from every list entry -- confirm against the real file.
        parsedEvents = (
            parseJvsEvent(post, onlyIfWithAk=onlyWithAks, year=year)
            for post in m.find_all(class_="posts")
        )
        jvsWkList[year][month] = [
            p.toDict() for p in parsedEvents if p is not None
        ]

    return jvsWkList


class event:
    '''
    A calendar entry: start date, optional end date, title, place and url.
    '''

    # NOTE(review): the signature and the first two assignments were missing
    # from the recovered text.  They are inferred from wk.__init__ (argument
    # order) and toDict (self.date, self.endDate) -- confirm.
    def __init__(self, timespan, title, place=None, url=None):
        '''
        timespan -- list of one or two ISO date strings (start[, end])
        title    -- event title
        place    -- location text or None
        url      -- link to the event page or None
        '''
        self.date = datetime.date.fromisoformat(timespan[0])
        self.endDate = None
        if len(timespan) >= 2:
            self.endDate = datetime.date.fromisoformat(timespan[1])
        self.title = title
        self.place = place
        self.url = url

    def toDict(self):
        '''Return the entry as a dictionary; unset optional fields are left out.'''
        wkDict = {}
        wkDict["date"] = str(self.date)
        if self.endDate is not None:
            wkDict["endDate"] = str(self.endDate)
        wkDict["title"] = self.title
        if self.place is not None:
            wkDict["place"] = self.place
        if self.url is not None:
            wkDict["url"] = self.url
        return wkDict

    def toJson(self):
        '''Return toDict() serialised as a JSON string.'''
        return json.dumps(self.toDict())


class wk(event):
    '''
    An event that additionally carries a list of age-group labels (akList).
    '''

    def __init__(self, timespan, title, akList, place=None, url=None):
        super().__init__(timespan, title, place, url)
        self.akList = akList

    def toDict(self):
        '''Return the event dictionary extended by the "akList" entry.'''
        wkDict = super().toDict()
        wkDict["akList"] = self.akList
        return wkDict


def parseJvsEvent(jvsEvent, onlyIfWithAk=False, year=None):
    '''
    Build an event/wk object from one calendar row.

    jvsEvent     -- parsed element of one row; its "col-2" children are
                    searched for <time> and <span>, "col-3" gives the place
                    and the <a> inside "col-4" gives title and url
    onlyIfWithAk -- if True, rows without age groups yield None
    year         -- year the row belongs to (the row text only holds day and
                    month); defaults to the current year

    returns a wk if age groups were found, otherwise an event (or None if
    onlyIfWithAk is set).  A row that cannot be parsed is printed and None
    is returned.
    '''
    if year is None:
        year = datetime.date.today().year

    try:
        aks = None
        timespan = None
        for cell in jvsEvent.find_all(class_="col-2"):
            timeTag = cell.find("time")
            if timeTag:
                timespan = parseDateSpan(timeTag.text, year)
            if cell.find("span"):
                aks = [span.text.strip() for span in cell.find_all("span")]
                # drop empty labels; an empty result counts as "no age groups"
                aks = [ak for ak in aks if ak != ""] or None
        if timespan is None:
            raise ValueError("row has no <time> element")

        place = jvsEvent.find(class_="col-3").text.strip() or None
        titleCell = jvsEvent.find(class_="col-4")
        link = titleCell.find("a")
        title = link.text.strip()
        url = link['href']
        if title != titleCell.text.strip():
            # The title cell holds more than the link text: report the row,
            # but still use the link text as title.
            print("Error parsing:")
            print(jvsEvent.prettify())

        if aks is not None:
            return wk(timespan, title, aks, place, url)
        if onlyIfWithAk:
            return None
        return event(timespan, title, place, url)
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # A missing element (find() returned None), a missing href or a
        # malformed date: report the row and skip it.
        print("Error parsing:")
        print(jvsEvent.prettify())
        return None


def main():
    '''
    Print the urls of all upcoming calendar entries with age groups, then
    the result of jvsCal.getWk for these urls.
    '''
    # NOTE(review): the driver addressed these functions through the jvsCal
    # module; its import line was not part of the recovered text.
    import jvsCal

    today = datetime.date.today()
    events = jvsCal.parseJvsCal(
        minYear=today.year, minMonth=today.month, onlyWithAks=True
    )
    urls = [e['url'] for y in events for m in events[y] for e in events[y][m]]
    print(urls)
    print(jvsCal.getWk(urls))


# The functions below keep the experiments that followed the first exit() of
# the driver script.  They were unreachable there and are not called by
# main() either.

def demoSingleWk(url=jvsCalShiaiUrl):
    '''Fetch one event page via jvsCal.wettkampf and print it as JSON.'''
    import jvsCal

    testWk = jvsCal.wettkampf.from_url(url)
    print(testWk.to_json())


def demoSavedEventPage(path="rkp.html", year="2020"):
    '''
    Parse a locally saved event page and print the extracted fields of every
    element with class "event-single".
    '''
    from bs4 import BeautifulSoup

    with open(path) as savedPage:
        jvsCalShiaiSoup = BeautifulSoup(savedPage, "html.parser")

    for e in jvsCalShiaiSoup.find_all(class_="event-single"):
        print(e.prettify())
        title = e.find("header").text.strip()
        articleTag = e.find("article")
        date = parseDateSpan(articleTag.find("time").text, year)

        # Exactly three <div> elements are expected below <article>; only
        # the last one is read.
        _announcementDiv, _placeDiv, ageGroupsDiv = articleTag.find_all("div")

        announcement = {}
        place = {}
        for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
            label = dt.text.strip()
            if label == "Ausschreibung:":
                announcement['url'] = dd.find("a")['href']
            if label == "Veranstalter:":
                announcement['organizer'] = dd.text.strip()
            if label == "Veranstaltungsort:":
                place['name'] = dd.text.strip()
            if label == "Veranstaltungsadresse:":
                place['address'] = re.sub(r"\s+", " ", dd.text.strip())

        ageGroups = [ak.text.strip() for ak in ageGroupsDiv.find_all("span")]

        # Printed per element, so nothing is referenced unbound when the
        # page contains no "event-single" element.
        print(f"title: {title}")
        print(f"date: {date}")
        print(f"announcement: {announcement}")
        print(f"place: {place}")
        print(f"ageGroups: {ageGroups}")


def demoCalendar(url=jvsCalUrl):
    '''Print every calendar entry with age groups as one JSON line.'''
    jvsWkList = []

    for m in fetchMonthSections(url):
        if m.has_attr("data-month"):
            yearMonth = m.get("data-month")
            year = yearMonth[0:4]
            month = yearMonth[4:6]
            print(f"Jahr: {year}, Monat: {month}")
            for post in m.find_all(class_="posts"):
                parsedEvent = parseJvsEvent(post, onlyIfWithAk=True, year=year)
                if parsedEvent is not None:
                    jvsWkList.append(parsedEvent)
        else:
            print("no data-month")

    for w in jvsWkList:
        print(w.toJson())


if __name__ == "__main__":
    main()