Changes staged for commit:

new file:     src/wkScraper/jvsCal.py
new file:     src/wkScraper/wkScraper-JvsCal.py
This commit is contained in:
marko
2020-10-11 09:40:27 +02:00
parent 4b1c208bfe
commit 7486a9e886
2 changed files with 362 additions and 0 deletions

View File

@@ -0,0 +1,165 @@
#! /usr/bin/env python3
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup
import datetime
import json
import re
import requests
jvsCalUrl = "https://judoverbandsachsen.de/kalender/?show=all"
def parseJvsCal(url=jvsCalUrl, minYear=0, minMonth=0, onlyWithAks=False):
    '''
    Parse the calendar page of the JVS (Judoverband Sachsen).

    url         -- calendar URL to fetch (defaults to the full JVS calendar)
    minYear     -- skip months of years before this one
    minMonth    -- within minYear, skip months before this one
    onlyWithAks -- drop events that carry no age-group (AK) annotation

    Returns a dictionary of dictionaries of lists:
    cal[year][month] = list of event dicts with the keys
    "time", "title", "aks", "place" and "url".
    '''
    jvsCalPage = requests.get(url)
    jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
    jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
    jvsCalEventMonths = jvsCalEventListItems.find_all("div", class_="month")
    jvsWkList = {}
    for m in jvsCalEventMonths:
        # month containers without a data-month attribute carry no events
        if not m.has_attr("data-month"):
            continue
        yearMonth = m.get("data-month")  # format: YYYYMM
        year = int(yearMonth[0:4])
        if year < minYear:
            continue
        if year not in jvsWkList:
            jvsWkList[year] = {}
        month = int(yearMonth[4:6])
        if year == minYear and month < minMonth:
            continue
        if month not in jvsWkList[year]:
            jvsWkList[year][month] = []
        for event in m.find_all(class_="posts"):
            # "eventUrl" avoids shadowing the "url" parameter of this function
            time, title, aks, place, eventUrl = parseJvsEvent(event, year)
            if aks is None and onlyWithAks:
                continue
            jvsWkList[year][month].append({
                "time": time,
                "title": title,
                "aks": aks,
                "place": place,
                "url": eventUrl
            })
    return jvsWkList
def parseJvsEvent(jvsEvent, year):
    '''
    Parse a single event row of the JVS calendar.

    jvsEvent -- BeautifulSoup tag of one event row (class "posts")
    year     -- year the event belongs to (the row only shows day and month)

    Returns the tuple (time, title, aks, place, url):
      time  -- list of ISO date strings (start[, end]) or None
      title -- event title or None
      aks   -- list of age-group strings, or None when absent
      place -- venue string, or None when empty
      url   -- link to the event page or None
    On a parse error the problem is printed and the not-yet-parsed
    elements of the tuple are returned as None.
    '''
    # pre-initialize so a parse error cannot cause an UnboundLocalError
    # at the return statement below
    time = title = aks = place = url = None
    try:
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                # e.g. "11.10. - 12.10." -> ["2020-10-11", "2020-10-12"]
                time = [t.strip() for t in re.sub(r"\s+", " ", e.find("time").text.strip()).split("-")]
                time = [t.split(".") for t in time]
                time = ["-".join([str(year), t[1], t[0]]) for t in time]
            if e.find("span"):
                aks = [ak.text.strip() for ak in e.find_all("span")]
                aks = [ak for ak in aks if ak != ""]
                if len(aks) == 0:
                    aks = None
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        # the anchor text is expected to be the whole column text
        assert title == titleFull
    except Exception as e:
        print(f"Error '{e}' parsing:")
        print(jvsEvent.prettify())
    return time, title, aks, place, url
def download_file(url):
    '''
    Download *url* into the current directory, streaming the body in
    8 KiB chunks, and return the name of the written file (the last
    path segment of the URL).
    '''
    local_filename = url.split('/')[-1]
    # stream=True keeps the response body out of memory until iterated
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(local_filename, 'wb') as out:
            # chunk-encoded responses may yield empty keep-alive chunks;
            # writing those is harmless, so they are not filtered out
            for block in response.iter_content(chunk_size=8192):
                out.write(block)
    return local_filename
def getWk(urlList):
    '''
    Fetch every URL in urlList over one shared HTTP session and parse
    each response body into a wettkampf instance; return the list.
    '''
    session = requests.Session()
    result = []
    for wkUrl in urlList:
        result.append(wettkampf.from_htmlString(session.get(wkUrl).content))
    return result
class wettkampf:
    '''
    A single competition (German: Wettkampf) scraped from a JVS event
    page.
    '''

    def __init__(self, title, year, month, day, announcement, organizer, place, address, ageGroups):
        '''
        title          -- event title
        year/month/day -- (start) date of the event
        announcement   -- URL of the announcement document
        organizer      -- organizing club / association
        place          -- venue name
        address        -- venue address
        ageGroups      -- list of age-group strings
        '''
        self.title = str(title)
        self.date = datetime.date(year, month, day)
        self.announcement = str(announcement)
        self.organizer = str(organizer)
        self.place = str(place)
        self.address = str(address)
        self.ageGroups = ageGroups

    def to_dict(self):
        '''Return the competition as a plain, JSON-serializable dict.'''
        return {
            'title': self.title,
            'date': self.date.strftime("%Y-%m-%d"),
            'announcement': self.announcement,
            'organizer': self.organizer,
            'place': self.place,
            'address': self.address,
            'ageGroups': self.ageGroups
        }

    def to_json(self):
        '''Return the competition serialized as a JSON string.'''
        return json.dumps(self.to_dict())

    @staticmethod
    def from_htmlString(wkString, year=None):
        '''
        Create a wettkampf from a html string of the event in the
        calendar of the JVS.

        The html string is only the wettkampf-specific part of the
        page of the wettkampf, not the whole page and not the whole
        calendar.  year defaults to the current year because the page
        shows only day and month.  Returns the first parsed event, or
        None when no "event-single" element is found.
        '''
        if year is None:
            year = datetime.date.today().year
        wkSoup = BeautifulSoup(wkString, "html.parser")
        for e in wkSoup.find_all(class_="event-single"):
            title = e.find("header").text.strip()
            articleTag = e.find("article")
            # e.g. "11.10. - 12.10." -> [["11", "10", ""], ["12", "10", ""]]
            date = [t.strip() for t in re.sub(r"\s+", " ", articleTag.find("time").text.strip()).split("-")]
            date = [t.split(".") for t in date]
            [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")
            # pre-initialize: a missing <dt> label no longer raises a
            # NameError when building the wettkampf below
            announcement = organizer = place = address = None
            for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
                if dt.text.strip() == "Ausschreibung:":
                    announcement = dd.find("a")['href']
                if dt.text.strip() == "Veranstalter:":
                    organizer = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsort:":
                    place = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsadresse:":
                    address = re.sub(r"\s+", " ", dd.text.strip())
            if announcement is None:
                print(f"no announcement in {e.prettify()}")
            ageGroups = [ak.text.strip() for ak in ageGroupsDiv.find_all("span")]
            # date[0] is the start date in [day, month, ...] order (DD.MM.)
            return wettkampf(title, year, int(date[0][1]), int(date[0][0]), announcement, organizer, place, address, ageGroups)

    @staticmethod
    def from_url(url, year=None):
        '''Fetch *url* and parse its body with from_htmlString.'''
        if year is None:
            year = datetime.date.today().year
        return wettkampf.from_htmlString(requests.get(url).content, year)

View File

@@ -0,0 +1,197 @@
#! /usr/bin/env python3
# -*- coding: UTF-8 -*-
import re
import requests
from bs4 import BeautifulSoup
import datetime
import json
import jvsCal
class shiai:
    '''
    Minimal record of a competition (Japanese: shiai): title, date and
    the admitted age groups.
    '''

    def __init__(self, titel, date, ageGroups):
        # the parameter is spelled "titel" (German); kept for interface
        # compatibility.  The original body assigned the undefined name
        # "title", which raised a NameError on every instantiation.
        self.title = titel
        self.date = date
        self.ageGroups = ageGroups

    def toDict(self):
        '''Return the record as a plain dict.'''
        return {
            'title': self.title,
            'date': self.date,
            'ageGroups': self.ageGroups,
        }

    def toJson(self):
        '''Return the record serialized as a JSON string.'''
        return json.dumps(self.toDict())
class event:
    '''
    One calendar entry, consisting of:
      - a start date (plus an optional end date)
      - a title (name)
      - optionally a place
      - optionally a URL
    '''

    def __init__(self, timespan, title, place=None, url=None):
        # timespan is a list of ISO date strings: [start] or [start, end]
        self.date = datetime.date.fromisoformat(timespan[0])
        self.endDate = datetime.date.fromisoformat(timespan[1]) if len(timespan) > 1 else None
        self.title = title
        self.place = place
        self.url = url

    def toDict(self):
        '''Return the entry as a dict; unset optional fields are omitted.'''
        entry = {"date": str(self.date)}
        if self.endDate is not None:
            entry["endDate"] = str(self.endDate)
        entry["title"] = self.title
        for optional in ("place", "url"):
            value = getattr(self, optional)
            if value is not None:
                entry[optional] = value
        return entry

    def toJson(self):
        '''Return the entry serialized as a JSON string.'''
        return json.dumps(self.toDict())
class wk(event):
    '''A calendar entry that additionally carries its age groups (AKs).'''

    def __init__(self, timespan, title, akList, place=None, url=None):
        super().__init__(timespan, title, place=place, url=url)
        self.akList = akList

    def toDict(self):
        '''Extend the base entry dict with the age-group list.'''
        return dict(super().toDict(), akList=self.akList)
def parseJvsEvent(jvsEvent, onlyIfWithAk=False, year=None):
    '''
    Parse one event row (class "posts") of the JVS calendar.

    jvsEvent     -- BeautifulSoup tag of the row
    onlyIfWithAk -- return None for events without age-group spans
    year         -- year string used to build the ISO dates; defaults to
                    the current year (the row shows only day and month)

    Returns a wk when age groups are present, otherwise an event, or
    None when onlyIfWithAk filters the row or parsing fails.
    '''
    if year is None:
        # the original relied on a module-level "year" global; take an
        # explicit parameter instead and default to the current year
        year = str(datetime.date.today().year)
    # pre-initialize so a parse failure cannot hit unbound locals below
    time = title = place = url = aks = None
    try:
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                # e.g. "11.10. - 12.10." -> ["2020-10-11", "2020-10-12"]
                time = [t.strip() for t in re.sub(r"\s+", " ", e.find("time").text.strip()).split("-")]
                time = [t.split(".") for t in time]
                time = ["-".join([year, t[1], t[0]]) for t in time]
            if e.find("span"):
                aks = [ak.text.strip() for ak in e.find_all("span")]
                aks = [ak for ak in aks if ak != ""]
                if len(aks) == 0:
                    aks = None
        # the original looked these up on the class "event" instead of the
        # jvsEvent argument, which raised AttributeError on every call
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert title == titleFull
    except Exception as err:
        print(f"Error '{err}' parsing:")
        print(jvsEvent.prettify())
        return None
    if aks is not None:
        return wk(time, title, aks, place, url)
    if onlyIfWithAk:
        return None
    return event(time, title, place, url)
# --- ad-hoc test driver -----------------------------------------------------
# NOTE(review): everything below is exploratory scratch code.  The first
# exit() stops execution, so only the jvsCal-based scraping directly below
# ever runs; the later sections are earlier experiments kept for reference.

# Scrape all events of the current year from this month on that carry
# age-group annotations, then fetch and parse each event page.
events = jvsCal.parseJvsCal(minYear=datetime.date.today().year, minMonth=datetime.date.today().month, onlyWithAks=True)
#events = jvsCal.parseJvsCal()
#print( jvsCal.parseJvsCal(minYear=datetime.date.today().year, minMonth=datetime.date.today().month, onlyWithAks=True) )
# flatten the year -> month -> event structure into a flat list of URLs
print( [ e['url'] for y in events for m in events[y] for e in events[y][m]] )
print( jvsCal.getWk([ e['url'] for y in events for m in events[y] for e in events[y][m]]) )
exit()

# --- unreachable: parse a single event page via jvsCal.wettkampf ------------
jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"
testWk = jvsCal.wettkampf.from_url( jvsCalShiaiUrl )
print(testWk.to_json())
exit()

#with open("rkp.html", "w") as f:
# f.write(BeautifulSoup(requests.get("https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/").content, "html.parser").prettify())
#exit()

# --- unreachable: manual parse of a locally saved event page (rkp.html) -----
url = "https://judoverbandsachsen.de/kalender/?show=all"
jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"
#jvsCalShiaiUrl = "file://rkp.html"
#jvsCalShiaiPage = requests.get(jvsCalShiaiUrl)
#jvsCalShiaiSoup = BeautifulSoup(jvsCalShiaiPage.content, "html.parser")
jvsCalShiaiSoup = BeautifulSoup(open("rkp.html"), "html.parser")
year = "2020"
for e in jvsCalShiaiSoup.find_all(class_="event-single"):
    print(e.prettify())
    title = e.find("header").text.strip()
    articleTag= e.find("article")
    # "11.10. - 12.10." (German DD.MM.) -> ["2020-10-11", "2020-10-12"]
    date = [ t.strip() for t in re.sub( "\s+", " ", articleTag.find("time").text.strip() ).split("-")]
    date = [ t.split(".") for t in date ]
    date = [ "-".join( [year, t[1], t[0]] ) for t in date ]
    [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")
    announcement = {}
    place = {}
    # map the German <dt> labels of the definition list onto dict fields
    for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
        if dt.text.strip() == "Ausschreibung:":
            announcement['url'] = dd.find("a")['href']
        if dt.text.strip() == "Veranstalter:":
            announcement['organizer'] = dd.text.strip()
        if dt.text.strip() == "Veranstaltungsort:":
            place['name'] = dd.text.strip()
        if dt.text.strip() == "Veranstaltungsadresse:":
            place['address'] = re.sub("\s+", " ", dd.text.strip())
        # print(dt.text, dd.text)
    ageGroups = [ ak.text.strip() for ak in ageGroupsDiv.find_all("span") ]
    print(f"title: {title})")
    print(f"date: {date})")
    print(f"announcement: {announcement}")
    print(f"place: {place}")
    print(f"ageGroups: {ageGroups}")
exit()

# --- unreachable: inline calendar scrape (superseded by jvsCal.parseJvsCal) -
jvsCalPage = requests.get(url)
jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
#jvsCalEventList = jvsCalSoup.find(id="eventList")
#print(jvsCalEventList.prettify())
jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
#print(jvsCalEventListItems.prettify())
jvsCalEventMonts = jvsCalEventListItems.find_all("div", class_="month")
jvsWkList = []
for m in jvsCalEventMonts:
    if m.has_attr("data-month"):
        yearMonth = m.get("data-month")  # format: YYYYMM
        year = (yearMonth[0:4])
        month = yearMonth[4:6]
        print(f"Jahr: {year}, Monat: {month}")
        events = m.find_all(class_="posts")
        for event in events:
            parsedEvent = parseJvsEvent(event, onlyIfWithAk=True)
            if parsedEvent is not None:
                jvsWkList.append(parsedEvent)
    else:
        print("no data-month")
for w in jvsWkList:
    print(w.toJson())