#! /usr/bin/env python3
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup
import datetime
import json
import re

jvsCalUrl = "https://judoverbandsachsen.de/kalender/?show=all"


def parseJvsCal(url=jvsCalUrl, minYear=0, minMonth=0, onlyWithAks=False):
    '''
    Parse the calendar page of the JVS.
    Returns a dictionary of dictionaries of lists:
    cal[year][month] = list of event dicts (time, title, aks, place, url).
    '''
    jvsCalPage = requests.get(url)
    jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
    jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
    jvsCalEventMonths = jvsCalEventListItems.find_all("div", class_="month")
    jvsWkList = {}
    for m in jvsCalEventMonths:
        if m.has_attr("data-month"):
            # data-month is formatted as YYYYMM
            yearMonth = m.get("data-month")
            year = int(yearMonth[0:4])
            if year < minYear:
                continue
            if year not in jvsWkList:
                jvsWkList[year] = {}
            month = int(yearMonth[4:6])
            if year == minYear and month < minMonth:
                continue
            if month not in jvsWkList[year]:
                jvsWkList[year][month] = []
            events = m.find_all(class_="posts")
            for event in events:
                # eventUrl avoids shadowing the url parameter of this function
                time, title, aks, place, eventUrl = parseJvsEvent(event, year)
                if aks is None and onlyWithAks:
                    continue
                jvsWkList[year][month].append({
                    "time": time,
                    "title": title,
                    "aks": aks,
                    "place": place,
                    "url": eventUrl
                })
    return jvsWkList
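
# A minimal usage sketch (assumes network access to judoverbandsachsen.de and
# the calendar markup parsed above):
#
#   cal = parseJvsCal(minYear=2021, minMonth=10, onlyWithAks=True)
#   for year, months in cal.items():
#       for month, events in months.items():
#           for ev in events:
#               print(year, month, ev["title"], ev["url"])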


def parseJvsEvent(jvsEvent, year):
    '''
    Parse an event out of the jvsCalendar.
    Returns a tuple (time, title, aks, place, url); fields that could not
    be parsed are None.
    '''
    # pre-initialize all fields so the return below cannot raise
    # UnboundLocalError when parsing fails half-way
    time, title, aks, place, url = None, None, None, None, None
    try:
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                # e.g. "17.10. - 18.10." -> ["2021-10-17", "2021-10-18"]
                time = [t.strip() for t in re.sub(r"\s+", " ", e.find("time").text.strip()).split("-")]
                time = [t.split(".") for t in time]
                time = ["-".join([str(year), t[1], t[0]]) for t in time]
            if e.find("span"):
                aks = [ak.text.strip() for ak in e.find_all("span")]
                aks = [ak for ak in aks if ak != ""]
                if len(aks) == 0:
                    aks = None
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert title == titleFull
    except Exception as e:
        print(f"Error '{e}' parsing:")
        print(jvsEvent.prettify())
    return time, title, aks, place, url
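
# Illustrative shape of the returned tuple (hypothetical values):
#   (['2021-10-17', '2021-10-18'], 'Some Title', ['U13', 'U15'],
#    'Some Place', 'https://judoverbandsachsen.de/...')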


def download_file(url):
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                # If the response is chunk-encoded, uncomment the if below
                # and set chunk_size to None.
                # if chunk:
                f.write(chunk)
    return local_filename
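
# Usage sketch (the URL is hypothetical; real announcement links come from
# parseJvsCal / wettkampf.announcement):
#
#   pdf = download_file("https://judoverbandsachsen.de/files/ausschreibung.pdf")
#   print(f"saved to {pdf}")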


def getWk(urlList):
    '''Fetch each event page and parse it into a wettkampf object.'''
    s = requests.Session()
    return [wettkampf.from_htmlString(s.get(url).content) for url in urlList]


class wettkampf:
    def __init__(self, title, year, month, day, announcement, organizer,
                 place, address, ageGroups):
        self.title = str(title)
        self.date = datetime.date(year, month, day)
        self.announcement = str(announcement)
        self.organizer = str(organizer)
        self.place = str(place)
        self.address = str(address)
        self.ageGroups = ageGroups

    def to_dict(self):
        return {
            'title': self.title,
            'date': self.date.strftime("%Y-%m-%d"),
            'announcement': self.announcement,
            'organizer': self.organizer,
            'place': self.place,
            'address': self.address,
            'ageGroups': self.ageGroups
        }

    def to_json(self, indent=2):
        return json.dumps(self.to_dict(), indent=indent)

    @staticmethod
    def from_htmlString(wkString, year=None):
        '''
        Create a wettkampf from an HTML string of the event in the
        calendar of the JVS.
        The HTML string is only the wettkampf-specific part of the
        wettkampf page, not the whole page and not the whole calendar.
        '''
        if year is None:
            year = datetime.date.today().year
        wkSoup = BeautifulSoup(wkString, "html.parser")
        for e in wkSoup.find_all(class_="event-single"):
            title = e.find("header").text.strip()
            articleTag = e.find("article")
            date = [t.strip() for t in re.sub(r"\s+", " ", articleTag.find("time").text.strip()).split("-")]
            date = [t.split(".") for t in date]
            # only ageGroupsDiv is used below
            [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")
            # pre-initialize so missing <dt>/<dd> pairs do not raise
            # UnboundLocalError in the constructor call below
            announcement = organizer = place = address = None
            for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
                if dt.text.strip() == "Ausschreibung:":
                    announcement = dd.find("a")['href']
                if dt.text.strip() == "Veranstalter:":
                    organizer = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsort:":
                    place = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsadresse:":
                    address = re.sub(r"\s+", " ", dd.text.strip())
            if announcement is None:
                print(f"no announcement in {e.prettify()}")
            ageGroups = [ak.text.strip() for ak in ageGroupsDiv.find_all("span")]
            # date[0] is the start date as [day, month, ...]
            return wettkampf(title, year, int(date[0][1]), int(date[0][0]),
                             announcement, organizer, place, address, ageGroups)

    @staticmethod
    def from_url(url, year=None):
        if year is None:
            year = datetime.date.today().year
        return wettkampf.from_htmlString(requests.get(url).content, year)
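

# End-to-end sketch (assumes the calendar and event pages are reachable and
# still use the markup parsed above):
#
#   if __name__ == "__main__":
#       cal = parseJvsCal(onlyWithAks=True)
#       urls = [ev["url"] for months in cal.values()
#               for events in months.values() for ev in events]
#       for wk in getWk(urls[:3]):
#           print(wk.to_json())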