Changes staged for commit:
new file: src/wkScraper/jvsCal.py
new file: src/wkScraper/wkScraper-JvsCal.py
wkOrg/src/wkScraper/jvsCal.py · 165 lines · Normal file
@@ -0,0 +1,165 @@
#! /usr/bin/env python3
# -*- coding: UTF-8 -*-

from bs4 import BeautifulSoup
import datetime
import json
import re
import requests


jvsCalUrl = "https://judoverbandsachsen.de/kalender/?show=all"

def parseJvsCal(url=jvsCalUrl, minYear=0, minMonth=0, onlyWithAks=False):
    '''
    Parse the calendar page of the JVS.

    Returns a dictionary of dictionaries of lists: cal[year][month] is a list
    of event dicts with the keys "time", "title", "aks", "place" and "url".
    '''
    jvsCalPage = requests.get(url)
    jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
    jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
    jvsCalEventMonths = jvsCalEventListItems.find_all("div", class_="month")

    jvsWkList = {}

    for m in jvsCalEventMonths:
        if m.has_attr("data-month"):
            yearMonth = m.get("data-month")
            year = int(yearMonth[0:4])
            if year < minYear:
                continue
            if year not in jvsWkList:
                jvsWkList[year] = {}
            month = int(yearMonth[4:6])
            if year == minYear and month < minMonth:
                continue
            if month not in jvsWkList[year]:
                jvsWkList[year][month] = []
            events = m.find_all(class_="posts")
            for event in events:
                # url = event.find(class_="col-4").find("a")['href']
                time, title, aks, place, url = parseJvsEvent(event, year)
                if aks is None and onlyWithAks:
                    continue
                jvsWkList[year][month].append({
                    "time": time,
                    "title": title,
                    "aks": aks,
                    "place": place,
                    "url": url
                })
    return jvsWkList

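# Usage sketch (illustrative only, not part of the module's API): iterating the
# nested dict that parseJvsCal() returns. The keys and event fields below just
# mirror the structure built above.
#
#   cal = parseJvsCal(minYear=2020, onlyWithAks=True)
#   for year in cal:
#       for month in cal[year]:
#           for ev in cal[year][month]:
#               print(ev["time"], ev["title"], ev["url"])
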
def parseJvsEvent(jvsEvent, year):
    '''
    Parse a single event out of the JVS calendar.

    Returns the tuple (time, title, aks, place, url).
    '''
    time = title = aks = place = url = None
    try:
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                time = [ t.strip() for t in re.sub( r"\s+", " ", e.find("time").text.strip() ).split("-") ]
                time = [ t.split(".") for t in time ]
                time = [ "-".join( [str(year), t[1], t[0]] ) for t in time ]
            if e.find("span"):
                aks = [ ak.text.strip() for ak in e.find_all("span") ]
                aks = [ ak for ak in aks if ak != "" ]
                if len(aks) == 0:
                    aks = None
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert title == titleFull
    except Exception as e:
        print(f"Error '{e}' parsing:")
        print(jvsEvent.prettify())

    return time, title, aks, place, url

def download_file(url):
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                # If the response is chunk-encoded, uncomment the "if" below
                # and set chunk_size to None.
                #if chunk:
                f.write(chunk)
    return local_filename

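# Usage sketch (announcementUrl is a placeholder for an "Ausschreibung" link
# taken from a parsed event, not something this module defines). The file is
# saved under its URL basename in the current working directory:
#
#   localPath = download_file(announcementUrl)
#   print(f"saved announcement to {localPath}")
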
def getWk(urlList):
    s = requests.Session()
    return [ wettkampf.from_htmlString(s.get(url).content) for url in urlList ]


class wettkampf:
    def __init__(self, title, year, month, day, announcement, organizer, place, address, ageGroups):
        self.title = str(title)
        self.date = datetime.date(year, month, day)
        self.announcement = str(announcement)
        self.organizer = str(organizer)
        self.place = str(place)
        self.address = str(address)
        self.ageGroups = ageGroups
        return

    def to_dict(self):
        return {
            'title': self.title,
            'date': self.date.strftime("%Y-%m-%d"),
            'announcement': self.announcement,
            'organizer': self.organizer,
            'place': self.place,
            'address': self.address,
            'ageGroups': self.ageGroups
        }

    def to_json(self):
        return json.dumps(self.to_dict())

    @staticmethod
    def from_htmlString(wkString, year=None):
        '''
        Create a wettkampf from an HTML string of the event in the
        calendar of the JVS.

        The HTML string is only the wettkampf-specific part of the event
        page, not the whole page and not the whole calendar.
        '''
        if year is None:
            year = datetime.date.today().year

        wkSoup = BeautifulSoup(wkString, "html.parser")

        for e in wkSoup.find_all(class_="event-single"):
            title = e.find("header").text.strip()
            articleTag = e.find("article")
            date = [ t.strip() for t in re.sub( r"\s+", " ", articleTag.find("time").text.strip() ).split("-") ]
            date = [ t.split(".") for t in date ]

            [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")

            announcement = organizer = place = address = None
            for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
                if dt.text.strip() == "Ausschreibung:":
                    announcement = dd.find("a")['href']
                if dt.text.strip() == "Veranstalter:":
                    organizer = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsort:":
                    place = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsadresse:":
                    address = re.sub(r"\s+", " ", dd.text.strip())
            if announcement is None:
                print(f"no announcement in {e.prettify()}")

            ageGroups = [ ak.text.strip() for ak in ageGroupsDiv.find_all("span") ]
            return wettkampf(title, year, int(date[0][1]), int(date[0][0]), announcement, organizer, place, address, ageGroups)

    @staticmethod
    def from_url(url, year=None):
        if year is None:
            year = datetime.date.today().year
        return wettkampf.from_htmlString( requests.get( url ).content, year )
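A minimal end-to-end sketch of how the module above is meant to be used (the event URL is the Räucherkerzenpokal page that the test script below also uses; whether that page is still online is an assumption):

    import jvsCal

    cal = jvsCal.parseJvsCal(onlyWithAks=True)
    urls = [ e["url"] for y in cal for m in cal[y] for e in cal[y][m] ]
    wk = jvsCal.wettkampf.from_url("https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/")
    print(wk.to_json())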
wkOrg/src/wkScraper/wkScraper-JvsCal.py · 197 lines · Executable file
@@ -0,0 +1,197 @@
#! /usr/bin/env python3
# -*- coding: UTF-8 -*-

import re
import requests
from bs4 import BeautifulSoup
import datetime
import json

import jvsCal


class shiai:
    def __init__(self, title, date, ageGroups):
        self.title = title
        self.date = date
        self.ageGroups = ageGroups

        return

    def toDict(self):
        return {
            'title': self.title,
            'date': self.date,
            'ageGroups': self.ageGroups,
        }

    def toJson(self):
        return json.dumps(self.toDict())


class event:
    '''
    An event consists of:
    - a time span
    - a start date (+ end date)
    - a title (name)
    - a place
    - a URL
    '''
    def __init__(self, timespan, title, place=None, url=None):
        self.date = datetime.date.fromisoformat( timespan[0] )
        self.endDate = None
        if len(timespan) >= 2:
            self.endDate = datetime.date.fromisoformat( timespan[1] )
        self.title = title
        self.place = place
        self.url = url
        return

    def toDict(self):
        wkDict = {}
        wkDict["date"] = str(self.date)
        if self.endDate is not None:
            wkDict["endDate"] = str(self.endDate)
        wkDict["title"] = self.title
        if self.place is not None:
            wkDict["place"] = self.place
        if self.url is not None:
            wkDict["url"] = self.url
        return wkDict

    def toJson(self):
        return json.dumps(self.toDict())


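# Construction sketch for the class above (title, place and URL are made-up
# example values; the ISO date strings match what parseJvsEvent builds below):
#
#   e = event(["2020-12-05", "2020-12-06"], "Beispielturnier", place="Dresden",
#             url="https://judoverbandsachsen.de/events/...")
#   print(e.toJson())
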
class wk(event):
    def __init__(self, timespan, title, akList, place=None, url=None):
        super().__init__(timespan, title, place, url)
        self.akList = akList

    def toDict(self):
        wkDict = super().toDict()
        wkDict["akList"] = self.akList
        return wkDict


def parseJvsEvent(jvsEvent, onlyIfWithAk=False):
    # 'year' is expected to be a module-level string, set further below before
    # this function is called.
    time = title = aks = place = url = None
    try:
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                time = [ t.strip() for t in re.sub( r"\s+", " ", e.find("time").text.strip() ).split("-") ]
                time = [ t.split(".") for t in time ]
                time = [ "-".join( [year, t[1], t[0]] ) for t in time ]
            if e.find("span"):
                aks = [ ak.text.strip() for ak in e.find_all("span") ]
                aks = [ ak for ak in aks if ak != "" ]
                if len(aks) == 0:
                    aks = None
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert title == titleFull
    except Exception as err:
        print(f"Error '{err}' parsing:")
        print(jvsEvent.prettify())

    if aks is not None:
        return wk(time, title, aks, place, url)
    else:
        if onlyIfWithAk:
            return None
        else:
            return event(time, title, place, url)


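# Return-value sketch: parseJvsEvent() yields a wk when age groups were found,
# an event when none were found (and onlyIfWithAk is False), or None otherwise.
# "eventTag" below is a placeholder for a BeautifulSoup node of one calendar row.
#
#   parsed = parseJvsEvent(eventTag)
#   if parsed is not None:
#       print(parsed.toJson())
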
events = jvsCal.parseJvsCal(minYear=datetime.date.today().year, minMonth=datetime.date.today().month, onlyWithAks=True)
#events = jvsCal.parseJvsCal()

#print( jvsCal.parseJvsCal(minYear=datetime.date.today().year, minMonth=datetime.date.today().month, onlyWithAks=True) )

print( [ e['url'] for y in events for m in events[y] for e in events[y][m]] )

print( jvsCal.getWk([ e['url'] for y in events for m in events[y] for e in events[y][m]]) )

exit()

jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"

testWk = jvsCal.wettkampf.from_url( jvsCalShiaiUrl )
print(testWk.to_json())
exit()


#with open("rkp.html", "w") as f:
#    f.write(BeautifulSoup(requests.get("https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/").content, "html.parser").prettify())
#exit()


url = "https://judoverbandsachsen.de/kalender/?show=all"

jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"
#jvsCalShiaiUrl = "file://rkp.html"
#jvsCalShiaiPage = requests.get(jvsCalShiaiUrl)
#jvsCalShiaiSoup = BeautifulSoup(jvsCalShiaiPage.content, "html.parser")
jvsCalShiaiSoup = BeautifulSoup(open("rkp.html"), "html.parser")

year = "2020"
for e in jvsCalShiaiSoup.find_all(class_="event-single"):
    print(e.prettify())
    title = e.find("header").text.strip()
    articleTag = e.find("article")
    date = [ t.strip() for t in re.sub( r"\s+", " ", articleTag.find("time").text.strip() ).split("-") ]
    date = [ t.split(".") for t in date ]
    date = [ "-".join( [year, t[1], t[0]] ) for t in date ]

    [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")

    announcement = {}
    place = {}
    for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
        if dt.text.strip() == "Ausschreibung:":
            announcement['url'] = dd.find("a")['href']
        if dt.text.strip() == "Veranstalter:":
            announcement['organizer'] = dd.text.strip()
        if dt.text.strip() == "Veranstaltungsort:":
            place['name'] = dd.text.strip()
        if dt.text.strip() == "Veranstaltungsadresse:":
            place['address'] = re.sub(r"\s+", " ", dd.text.strip())
        # print(dt.text, dd.text)

    ageGroups = [ ak.text.strip() for ak in ageGroupsDiv.find_all("span") ]

    print(f"title: {title}")
    print(f"date: {date}")
    print(f"announcement: {announcement}")
    print(f"place: {place}")
    print(f"ageGroups: {ageGroups}")
exit()


jvsCalPage = requests.get(url)

jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")

#jvsCalEventList = jvsCalSoup.find(id="eventList")
#print(jvsCalEventList.prettify())

jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
#print(jvsCalEventListItems.prettify())

jvsCalEventMonths = jvsCalEventListItems.find_all("div", class_="month")

jvsWkList = []

for m in jvsCalEventMonths:
    if m.has_attr("data-month"):
        yearMonth = m.get("data-month")
        year = yearMonth[0:4]
        month = yearMonth[4:6]
        print(f"Jahr: {year}, Monat: {month}")
        events = m.find_all(class_="posts")
        for eventTag in events:  # named eventTag to avoid shadowing the event class above
            parsedEvent = parseJvsEvent(eventTag, onlyIfWithAk=True)
            if parsedEvent is not None:
                jvsWkList.append(parsedEvent)
    else:
        print("no data-month")

for w in jvsWkList:
    print(w.toJson())