Changes staged for commit:
new file: src/wkScraper/jvsCal.py
new file: src/wkScraper/wkScraper-JvsCal.py
165  wkOrg/src/wkScraper/jvsCal.py  Normal file
@@ -0,0 +1,165 @@
#! /usr/bin/env python3
# -*- coding: UTF-8 -*-

from bs4 import BeautifulSoup
import datetime
import json
import re
import requests

jvsCalUrl = "https://judoverbandsachsen.de/kalender/?show=all"


def parseJvsCal(url=jvsCalUrl, minYear=0, minMonth=0, onlyWithAks=False):
    '''
    Parse the calendar page of the JVS.

    Returns a dictionary of dictionaries of lists:
    cal[year][month] = list of event dicts (time, title, aks, place, url).
    '''
    jvsCalPage = requests.get(url)
    jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
    jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
    jvsCalEventMonths = jvsCalEventListItems.find_all("div", class_="month")

    jvsWkList = {}

    for m in jvsCalEventMonths:
        if m.has_attr("data-month"):
            yearMonth = m.get("data-month")
            year = int(yearMonth[0:4])
            if year < minYear:
                continue
            if year not in jvsWkList:
                jvsWkList[year] = {}
            month = int(yearMonth[4:6])
            if year == minYear and month < minMonth:
                continue
            if month not in jvsWkList[year]:
                jvsWkList[year][month] = []
            events = m.find_all(class_="posts")
            for event in events:
                # url = event.find(class_="col-4").find("a")['href']
                time, title, aks, place, url = parseJvsEvent(event, year)
                if aks is None and onlyWithAks:
                    continue
                jvsWkList[year][month].append({
                    "time": time,
                    "title": title,
                    "aks": aks,
                    "place": place,
                    "url": url
                })
    return jvsWkList

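# Usage sketch (illustrative only): one way the nested structure returned by
# parseJvsCal() could be walked; the year/month filter values are placeholders
# and the sketch assumes the JVS calendar page is reachable.
#
#   cal = parseJvsCal(minYear=2024, minMonth=1, onlyWithAks=True)
#   for year, months in cal.items():
#       for month, entries in months.items():
#           for entry in entries:
#               print(year, month, entry["time"], entry["title"], entry["url"])
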
def parseJvsEvent(jvsEvent, year):
    '''
    Parse an event out of the JVS calendar.

    Returns (time, title, aks, place, url).
    '''
    try:
        aks = None
        # defaults so the return below cannot raise NameError if parsing fails
        time = title = place = url = None
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                time = [ t.strip() for t in re.sub( r"\s+", " ", e.find("time").text.strip() ).split("-")]
                time = [ t.split(".") for t in time ]
                time = [ "-".join( [str(year), t[1], t[0]] ) for t in time ]
            if e.find("span"):
                aks = [ ak.text.strip() for ak in e.find_all("span") ]
                aks = [ ak for ak in aks if ak != "" ]
                if len(aks) == 0:
                    aks = None
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert(title==titleFull)
    except Exception as e:
        print(f"Error '{e}' parsing:")
        print(jvsEvent.prettify())

    return time, title, aks, place, url

def download_file(url):
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                # If you have chunk encoded response uncomment if
                # and set chunk_size parameter to None.
                #if chunk:
                f.write(chunk)
    return local_filename

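# Usage sketch (illustrative only): download_file() streams the response to a
# local file named after the last URL path segment, e.g. the announcement PDF
# of a parsed event; the URL below is a placeholder, not a real announcement.
#
#   local_path = download_file("https://example.org/ausschreibung.pdf")
#   print(local_path)   # -> "ausschreibung.pdf"
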
def getWk(urlList):
    s = requests.Session()
    return [ wettkampf.from_htmlString(s.get(url).content) for url in urlList ]


class wettkampf:
    def __init__(self, title, year, month, day, announcement, organizer, place, address, ageGroups):
        self.title = str(title)
        self.date = datetime.date(year, month, day)
        self.announcement = str(announcement)
        self.organizer = str(organizer)
        self.place = str(place)
        self.address = str(address)
        self.ageGroups = ageGroups
        return
    def to_dict(self):
        return {
            'title': self.title,
            'date': self.date.strftime("%Y-%m-%d"),
            'announcement': self.announcement,
            'organizer': self.organizer,
            'place': self.place,
            'address': self.address,
            'ageGroups': self.ageGroups
        }
    def to_json(self):
        return json.dumps(self.to_dict())
    @staticmethod
    def from_htmlString(wkString, year=None):
        '''
        Create a wettkampf from an HTML string of the event in the
        calendar of the JVS.

        The HTML string is only the wettkampf-specific part of the
        wettkampf page, not the whole page and not the whole calendar.
        '''
        if year is None:
            year = datetime.date.today().year

        wkSoup = BeautifulSoup(wkString, "html.parser")

        for e in wkSoup.find_all(class_="event-single"):
            title = e.find("header").text.strip()
            articleTag = e.find("article")
            date = [ t.strip() for t in re.sub( r"\s+", " ", articleTag.find("time").text.strip() ).split("-")]
            date = [ t.split(".") for t in date ]

            [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")

            announcement = None
            # defaults in case one of the labels below is missing on the page
            organizer = place = address = None
            for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
                if dt.text.strip() == "Ausschreibung:":
                    announcement = dd.find("a")['href']
                if dt.text.strip() == "Veranstalter:":
                    organizer = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsort:":
                    place = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsadresse:":
                    address = re.sub(r"\s+", " ", dd.text.strip())
            if announcement is None:
                print(f"no announcement in {e.prettify()}")

            ageGroups = [ ak.text.strip() for ak in ageGroupsDiv.find_all("span") ]
            return wettkampf(title, year, int(date[0][1]), int(date[0][0]), announcement, organizer, place, address, ageGroups)

    @staticmethod
    def from_url(url, year=None):
        if year is None:
            year = datetime.date.today().year
        return wettkampf.from_htmlString( requests.get( url ).content, year )
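For reference, a small hypothetical sketch of constructing a wettkampf directly and serializing it with to_json(); every value below is a made-up placeholder and only illustrates the expected argument types (ints for year/month/day, a list of strings for ageGroups):

    # hypothetical placeholder values, not taken from the JVS calendar
    wkExample = wettkampf(
        title="Beispielturnier",
        year=2024, month=3, day=16,
        announcement="https://example.org/ausschreibung.pdf",
        organizer="Beispielverein",
        place="Sporthalle Musterstadt",
        address="Musterstr. 1, Musterstadt",
        ageGroups=["U11", "U13"],
    )
    print(wkExample.to_json())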
197  wkOrg/src/wkScraper/wkScraper-JvsCal.py  Executable file
@@ -0,0 +1,197 @@
#! /usr/bin/env python3
# -*- coding: UTF-8 -*-

import re
import requests
from bs4 import BeautifulSoup
import datetime
import json

import jvsCal


class shiai:
    def __init__(self, title, date, ageGroups):
        self.title = title
        self.date = date
        self.ageGroups = ageGroups

        return
    def toDict(self):
        return {
            'title': self.title,
            'date': self.date,
            'ageGroups': self.ageGroups,
        }
    def toJson(self):
        return json.dumps(self.toDict())


class event:
    '''
    An event consists of:
    - a time span: start date (+ optional end date)
    - a title (name)
    - a place
    - a URL
    '''
    def __init__(self, timespan, title, place=None, url=None):
        self.date = datetime.date.fromisoformat( timespan[0] )
        self.endDate = None
        if len(timespan) >= 2:
            self.endDate = datetime.date.fromisoformat( timespan[1] )
        self.title = title
        self.place = place
        self.url = url
        return
    def toDict(self):
        wkDict = {}
        wkDict["date"] = str(self.date)
        if self.endDate is not None:
            wkDict["endDate"] = str(self.endDate)
        wkDict["title"] = self.title
        if self.place is not None:
            wkDict["place"] = self.place
        if self.url is not None:
            wkDict["url"] = self.url
        return wkDict
    def toJson(self):
        return json.dumps(self.toDict())


class wk(event):
    def __init__(self, timespan, title, akList, place=None, url=None):
        super().__init__(timespan, title, place, url)
        self.akList = akList
    def toDict(self):
        wkDict = super().toDict()
        wkDict["akList"] = self.akList
        return wkDict

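# Illustrative sketch: wk.toDict() extends event.toDict() with the akList, so
# a wk built from placeholder values (the date, title and place below are made
# up) serializes roughly like this:
#
#   w = wk(["2024-03-16"], "Beispielturnier", ["U11", "U13"], place="Dresden")
#   print(w.toJson())
#   # -> {"date": "2024-03-16", "title": "Beispielturnier", "place": "Dresden", "akList": ["U11", "U13"]}
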
def parseJvsEvent(jvsEvent, year, onlyIfWithAk=False):
    try:
        aks = None
        # defaults so the code after the try block cannot raise NameError
        time = title = place = url = None
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                time = [ t.strip() for t in re.sub( r"\s+", " ", e.find("time").text.strip() ).split("-")]
                time = [ t.split(".") for t in time ]
                time = [ "-".join( [year, t[1], t[0]] ) for t in time ]
            if e.find("span"):
                aks = [ ak.text.strip() for ak in e.find_all("span") ]
                aks = [ ak for ak in aks if ak != "" ]
                if len(aks) == 0:
                    aks = None
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert(title==titleFull)
    except Exception as err:
        print(f"Error '{err}' parsing:")
        print(jvsEvent.prettify())

    if aks is not None:
        return wk(time, title, aks, place, url)
    else:
        if onlyIfWithAk:
            return None
        else:
            return event(time, title, place, url)


events = jvsCal.parseJvsCal(minYear=datetime.date.today().year, minMonth=datetime.date.today().month, onlyWithAks=True)
#events = jvsCal.parseJvsCal()

#print( jvsCal.parseJvsCal(minYear=datetime.date.today().year, minMonth=datetime.date.today().month, onlyWithAks=True) )

print( [ e['url'] for y in events for m in events[y] for e in events[y][m]] )

print( jvsCal.getWk([ e['url'] for y in events for m in events[y] for e in events[y][m]]) )

exit()


jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"

testWk = jvsCal.wettkampf.from_url( jvsCalShiaiUrl )
print(testWk.to_json())
exit()


#with open("rkp.html", "w") as f:
#    f.write(BeautifulSoup(requests.get("https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/").content, "html.parser").prettify())
#exit()


url = "https://judoverbandsachsen.de/kalender/?show=all"

jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"
#jvsCalShiaiUrl = "file://rkp.html"
#jvsCalShiaiPage = requests.get(jvsCalShiaiUrl)
#jvsCalShiaiSoup = BeautifulSoup(jvsCalShiaiPage.content, "html.parser")
jvsCalShiaiSoup = BeautifulSoup(open("rkp.html"), "html.parser")

year = "2020"
for e in jvsCalShiaiSoup.find_all(class_="event-single"):
    print(e.prettify())
    title = e.find("header").text.strip()
    articleTag = e.find("article")
    date = [ t.strip() for t in re.sub( r"\s+", " ", articleTag.find("time").text.strip() ).split("-")]
    date = [ t.split(".") for t in date ]
    date = [ "-".join( [year, t[1], t[0]] ) for t in date ]

    [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")

    announcement = {}
    place = {}
    for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
        if dt.text.strip() == "Ausschreibung:":
            announcement['url'] = dd.find("a")['href']
        if dt.text.strip() == "Veranstalter:":
            announcement['organizer'] = dd.text.strip()
        if dt.text.strip() == "Veranstaltungsort:":
            place['name'] = dd.text.strip()
        if dt.text.strip() == "Veranstaltungsadresse:":
            place['address'] = re.sub(r"\s+", " ", dd.text.strip())
        # print(dt.text, dd.text)

    ageGroups = [ ak.text.strip() for ak in ageGroupsDiv.find_all("span") ]

    print(f"title: {title}")
    print(f"date: {date}")
    print(f"announcement: {announcement}")
    print(f"place: {place}")
    print(f"ageGroups: {ageGroups}")
    exit()


jvsCalPage = requests.get(url)

jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")

#jvsCalEventList = jvsCalSoup.find(id="eventList")
#print(jvsCalEventList.prettify())

jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
#print(jvsCalEventListItems.prettify())

jvsCalEventMonts = jvsCalEventListItems.find_all("div", class_="month")

jvsWkList = []

for m in jvsCalEventMonts:
    if m.has_attr("data-month"):
        yearMonth = m.get("data-month")
        year = yearMonth[0:4]
        month = yearMonth[4:6]
        print(f"Jahr: {year}, Monat: {month}")
        events = m.find_all(class_="posts")
        for event in events:
            parsedEvent = parseJvsEvent(event, year, onlyIfWithAk=True)
            if parsedEvent is not None:
                jvsWkList.append(parsedEvent)
    else:
        print("no data-month")

for w in jvsWkList:
    print(w.toJson())