Files
cwsvJudo/wkOrg/src/wkScraper/wkScraper-JvsCal.py
2021-10-17 12:17:54 +02:00

205 lines
5.6 KiB
Python
Executable File

#! /usr/bin/env python3
# -*- coding: UTF-8 -*-
import re
import requests
from bs4 import BeautifulSoup
import datetime
import json
import jvsCal
class shiai:
    """A tournament (shiai) record: title, date and participating age groups."""

    def __init__(self, title, date, ageGroups):
        # BUG FIX: the original parameter was misspelled `titel` while the
        # body assigned `self.title = title`, raising NameError on every
        # instantiation.
        self.title = title
        self.date = date
        self.ageGroups = ageGroups

    def toDict(self):
        """Return the shiai as a plain, JSON-serialisable dict."""
        return {
            'title': self.title,
            'date': self.date,
            'ageGroups': self.ageGroups,
        }

    def toJson(self):
        """Return the shiai serialised as a JSON string."""
        return json.dumps(self.toDict())
class event:
    """A calendar event.

    An event consists of:
    - a timespan: a start date and an optional end date (ISO strings),
    - a title,
    - an optional place,
    - an optional URL.
    """

    def __init__(self, timespan, title, place=None, url=None):
        self.date = datetime.date.fromisoformat(timespan[0])
        # Single-day events carry no end date.
        self.endDate = (
            datetime.date.fromisoformat(timespan[1]) if len(timespan) >= 2 else None
        )
        self.title = title
        self.place = place
        self.url = url

    def toDict(self):
        """Serialise to a dict; optional fields are omitted when unset."""
        result = {"date": str(self.date)}
        if self.endDate is not None:
            result["endDate"] = str(self.endDate)
        result["title"] = self.title
        if self.place is not None:
            result["place"] = self.place
        if self.url is not None:
            result["url"] = self.url
        return result

    def toJson(self):
        """Serialise to a JSON string."""
        return json.dumps(self.toDict())
class wk(event):
    """An `event` that additionally carries a list of age groups (akList)."""

    def __init__(self, timespan, title, akList, place=None, url=None):
        super().__init__(timespan, title, place, url)
        self.akList = akList

    def toDict(self):
        """Serialise like `event.toDict`, with the age-group list appended."""
        return {**super().toDict(), "akList": self.akList}
def parseJvsEvent(jvsEvent, onlyIfWithAk=False, eventYear=None):
    """Parse one JVS calendar list item into a `wk` or `event`.

    Parameters:
        jvsEvent: BeautifulSoup tag of one calendar row (columns col-2..col-4).
        onlyIfWithAk: if True, return None for entries without age groups.
        eventYear: year string used to complete the "DD.MM." dates; when None,
            the module-level `year` (set by the calling loop in __main__) is
            used, preserving the original behaviour.

    Returns:
        A `wk` when age groups were found, otherwise an `event`
        (or None when `onlyIfWithAk` is set), or None on a parse error.
    """
    if eventYear is None:
        eventYear = year  # module-level variable set by the caller's loop
    try:
        aks = None
        time = None
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                # e.g. "17.10. - 18.10." -> ["2021-10-17", "2021-10-18"]
                time = [t.strip() for t in re.sub(r"\s+", " ", e.find("time").text.strip()).split("-")]
                time = [t.split(".") for t in time]
                time = ["-".join([eventYear, t[1], t[0]]) for t in time]
            if e.find("span"):
                aks = [ak.text.strip() for ak in e.find_all("span")]
                aks = [ak for ak in aks if ak != ""]
                if len(aks) == 0:
                    aks = None
        # BUG FIX: the original accessed the *class* `event` here instead of
        # the `jvsEvent` parameter; the resulting AttributeError was swallowed
        # by a bare except and execution then crashed on undefined locals.
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert title == titleFull
    except Exception:
        # Report the unparsable entry and skip it instead of falling through
        # to code that references undefined locals.
        print("Error parsing:")
        print(jvsEvent.prettify())
        return None
    if aks is not None:
        return wk(time, title, aks, place, url)
    if onlyIfWithAk:
        return None
    return event(time, title, place, url)
# NOTE(review): this __main__ section is staged exploratory/debug code.  Only
# the first stage up to `exit(-1)` ever runs; every later stage is dead code
# kept from earlier experiments, each terminated by its own `exit()`.
if __name__=="__main__":
    # Stage 1: fetch the parsed calendar via the jvsCal helper module and dump
    # the very first event, then quit.
    events = jvsCal.parseJvsCal(minYear=datetime.date.today().year, minMonth=datetime.date.today().month, onlyWithAks=True)
    # print(f"{json.dumps(events, indent=2)}")
    for year in events:
        for month in events[year]:
            for event in events[year][month]:
                # NOTE(review): `event` and `wk` here shadow the classes of
                # the same names defined above in this file.
                print(f"{event}")
                wk = jvsCal.wettkampf.from_url( event['url'] )
                print(f"{wk.to_json()}")
                exit(-1)  # debugging aid: stop after the first event
    # --- dead code from here on (unreachable once the loop exits above) ---
    print( [ e['url'] for y in events for m in events[y] for e in events[y][m]] )
    print( jvsCal.getWk([ e['url'] for y in events for m in events[y] for e in events[y][m]]) )
    exit()
    # Stage 2 (dead): parse a single known event page via jvsCal.
    jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"
    testWk = jvsCal.wettkampf.from_url( jvsCalShiaiUrl )
    print(testWk.to_json())
    exit()
    #with open("rkp.html", "w") as f:
    #    f.write(BeautifulSoup(requests.get("https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/").content, "html.parser").prettify())
    #exit()
    # Stage 3 (dead): hand-rolled parsing of a locally saved event page.
    url = "https://judoverbandsachsen.de/kalender/?show=all"
    jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"
    #jvsCalShiaiUrl = "file://rkp.html"
    #jvsCalShiaiPage = requests.get(jvsCalShiaiUrl)
    #jvsCalShiaiSoup = BeautifulSoup(jvsCalShiaiPage.content, "html.parser")
    jvsCalShiaiSoup = BeautifulSoup(open("rkp.html"), "html.parser")
    year = "2020"
    for e in jvsCalShiaiSoup.find_all(class_="event-single"):
        print(e.prettify())
        title = e.find("header").text.strip()
        articleTag= e.find("article")
        # "DD.MM. - DD.MM." plus the year -> list of "YYYY-MM-DD" strings
        date = [ t.strip() for t in re.sub( "\s+", " ", articleTag.find("time").text.strip() ).split("-")]
        date = [ t.split(".") for t in date ]
        date = [ "-".join( [year, t[1], t[0]] ) for t in date ]
        [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")
        announcement = {}
        place = {}
        # The <dt>/<dd> pairs carry the announcement URL, the organizer, and
        # the venue name/address (German labels from the website).
        for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
            if dt.text.strip() == "Ausschreibung:":
                announcement['url'] = dd.find("a")['href']
            if dt.text.strip() == "Veranstalter:":
                announcement['organizer'] = dd.text.strip()
            if dt.text.strip() == "Veranstaltungsort:":
                place['name'] = dd.text.strip()
            if dt.text.strip() == "Veranstaltungsadresse:":
                place['address'] = re.sub("\s+", " ", dd.text.strip())
            # print(dt.text, dd.text)
        ageGroups = [ ak.text.strip() for ak in ageGroupsDiv.find_all("span") ]
        print(f"title: {title})")
        print(f"date: {date})")
        print(f"announcement: {announcement}")
        print(f"place: {place}")
        print(f"ageGroups: {ageGroups}")
    exit()
    # Stage 4 (dead): parse the full calendar list with parseJvsEvent and
    # print all found Wettkaempfe as JSON.
    jvsCalPage = requests.get(url)
    jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
    #jvsCalEventList = jvsCalSoup.find(id="eventList")
    #print(jvsCalEventList.prettify())
    jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
    #print(jvsCalEventListItems.prettify())
    jvsCalEventMonts = jvsCalEventListItems.find_all("div", class_="month")
    jvsWkList = []
    for m in jvsCalEventMonts:
        if m.has_attr("data-month"):
            # "data-month" is "YYYYMM"; this also sets the module-level `year`
            # that parseJvsEvent reads.
            yearMonth = m.get("data-month")
            year = (yearMonth[0:4])
            month = yearMonth[4:6]
            print(f"Jahr: {year}, Monat: {month}")
            events = m.find_all(class_="posts")
            for event in events:
                parsedEvent = parseJvsEvent(event, onlyIfWithAk=True)
                if parsedEvent is not None:
                    jvsWkList.append(parsedEvent)
        else:
            print("no data-month")
    for w in jvsWkList:
        print(w.toJson())