166 lines
5.2 KiB
Python
166 lines
5.2 KiB
Python
#! /usr/bin/env python3
|
|
# -*- coding: UTF-8 -*-
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import datetime
|
|
import json
|
|
import re
|
|
|
|
jvsCalUrl = "https://judoverbandsachsen.de/kalender/?show=all"
|
|
|
|
def parseJvsCal(url=jvsCalUrl, minYear = 0, minMonth = 0, onlyWithAks=False):
    '''
    Parse the calendar page of the JVS (Judoverband Sachsen).

    Parameters
    ----------
    url : str
        Calendar page URL; defaults to the full JVS calendar.
    minYear : int
        Skip months belonging to years earlier than this.
    minMonth : int
        Within minYear, additionally skip months earlier than this.
    onlyWithAks : bool
        If True, drop events that list no age groups ("Altersklassen").

    Returns
    -------
    dict
        Nested mapping cal[year][month] -> list of event dicts with the
        keys "time", "title", "aks", "place", "url".
    '''
    jvsCalPage = requests.get(url)
    jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
    jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
    jvsCalEventMonts = jvsCalEventListItems.find_all("div", class_="month")

    jvsWkList = {}

    for m in jvsCalEventMonts:
        if not m.has_attr("data-month"):
            continue
        # "data-month" is formatted YYYYMM, e.g. "202405".
        yearMonth = m.get("data-month")
        year = int(yearMonth[0:4])
        if year < minYear:
            continue
        if year not in jvsWkList:
            jvsWkList[year] = {}
        month = int(yearMonth[4:6])
        if year == minYear and month < minMonth:
            continue
        if month not in jvsWkList[year]:
            jvsWkList[year][month] = []
        for event in m.find_all(class_="posts"):
            # NOTE: use a name distinct from the ``url`` parameter; the
            # original shadowed it with the per-event URL.
            time, title, aks, place, eventUrl = parseJvsEvent(event, year)
            if aks is None and onlyWithAks:
                continue
            jvsWkList[year][month].append({
                "time": time,
                "title": title,
                "aks": aks,
                "place": place,
                "url": eventUrl
            })
    return jvsWkList
|
|
|
|
def parseJvsEvent(jvsEvent, year):
    '''
    Parse a single event row of the JVS calendar.

    Parameters
    ----------
    jvsEvent : bs4.element.Tag
        One event element (a row with col-2 / col-3 / col-4 cells).
    year : int
        Year of the event; the row's date text carries no year.

    Returns
    -------
    tuple
        (time, title, aks, place, url) where ``time`` is a list of
        ISO-style "YYYY-MM-DD" strings (start[, end]), ``aks`` is a
        list of age-group strings or None, ``place`` is a str or None.
        On a parse error the element is dumped to stdout and any field
        that could not be extracted is returned as None.
    '''
    # Pre-initialize every result so a parse failure cannot raise
    # UnboundLocalError on the final return (the original only
    # initialized ``aks``).
    time = title = aks = place = url = None
    try:
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                # Date cell looks like "01.05. - 02.05."; normalize
                # whitespace, split start/end, rebuild as ISO dates.
                time = [ t.strip() for t in re.sub( r"\s+", " ", e.find("time").text.strip() ).split("-")]
                time = [ t.split(".") for t in time ]
                time = [ "-".join( [str(year), t[1], t[0]] ) for t in time ]
            if e.find("span"):
                # Age groups ("Altersklassen") appear as span elements.
                aks = [ ak.text.strip() for ak in e.find_all("span") ]
                aks = [ ak for ak in aks if ak != "" ]
                if len(aks) == 0:
                    aks = None
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        # Sanity check: the link text should be the whole cell text.
        assert(title == titleFull)
    # Renamed from ``e`` so the handler no longer shadows the loop
    # variable above.
    except Exception as err:
        print(f"Error '{err}' parsing:")
        print(jvsEvent.prettify())

    return time, title, aks, place, url
|
|
|
|
def download_file(url):
    '''
    Download *url* into the current directory, streaming in chunks.

    The local file name is the last path segment of the URL; it is
    returned after the download completes.  Raises requests.HTTPError
    on a non-success status code.
    '''
    target = url.split('/')[-1]
    # Stream so large files are never held in memory as a whole.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(target, 'wb') as out:
            # For chunk-encoded responses set chunk_size=None and skip
            # empty keep-alive chunks (``if block:``) before writing.
            for block in response.iter_content(chunk_size=8192):
                out.write(block)
    return target
|
|
|
|
def getWk(urlList):
    '''
    Fetch every URL in *urlList* over a single HTTP session and parse
    each response body into a wettkampf instance.

    Returns the resulting list of wettkampf objects, in input order.
    '''
    session = requests.Session()
    wettkaempfe = []
    for wkUrl in urlList:
        page = session.get(wkUrl)
        wettkaempfe.append(wettkampf.from_htmlString(page.content))
    return wettkaempfe
|
|
|
|
class wettkampf:
    '''
    A single competition ("Wettkampf") scraped from the JVS calendar:
    title, date, announcement URL, organizer, venue name and address,
    and the participating age groups ("Altersklassen").
    '''

    def __init__(self, title, year, month, day, announcement, organizer, place, address, ageGroups):
        '''
        Store the competition data.

        title, announcement, organizer, place and address are coerced
        to str; year/month/day are combined into a datetime.date;
        ageGroups is stored as given (expected: list of str).
        '''
        self.title = str(title)
        self.date = datetime.date(year, month, day)
        self.announcement = str(announcement)
        self.organizer = str(organizer)
        self.place = str(place)
        self.address = str(address)
        self.ageGroups = ageGroups
        return

    def to_dict(self):
        '''Return the competition as a JSON-serializable dict; the
        date is rendered as "YYYY-MM-DD".'''
        return {
            'title': self.title,
            'date': self.date.strftime("%Y-%m-%d"),
            'announcement': self.announcement,
            'organizer': self.organizer,
            'place': self.place,
            'address': self.address,
            'ageGroups': self.ageGroups
        }

    def to_json(self, indent=2):
        '''Return the competition serialized as a JSON string.'''
        return json.dumps(self.to_dict(), indent=indent)

    @staticmethod
    def from_htmlString(wkString, year=None):
        '''
        Create a wettkampf from an HTML string of the event in the
        calendar of the JVS.

        The HTML string is only the wettkampf-specific part of the
        page of the wettkampf — not the whole page and not the whole
        calendar.  The first "event-single" element is parsed and
        returned as a wettkampf (None if no such element exists).  The
        date text carries no year, so *year* (default: current year)
        is supplied by the caller.
        '''
        if year is None:
            year = datetime.date.today().year

        wkSoup = BeautifulSoup(wkString, "html.parser")

        for e in wkSoup.find_all(class_="event-single"):
            title = e.find("header").text.strip()
            articleTag = e.find("article")
            # Date text like "01.05. - 02.05."; only the start date is
            # used when constructing the wettkampf below.
            date = [ t.strip() for t in re.sub( r"\s+", " ", articleTag.find("time").text.strip() ).split("-")]
            date = [ t.split(".") for t in date ]

            [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")

            # Pre-initialize all <dt>-keyed fields so a missing label
            # cannot raise UnboundLocalError at the return (the
            # original only initialized ``announcement``).
            announcement = organizer = place = address = None
            for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
                if dt.text.strip() == "Ausschreibung:":
                    announcement = dd.find("a")['href']
                if dt.text.strip() == "Veranstalter:":
                    organizer = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsort:":
                    place = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsadresse:":
                    address = re.sub(r"\s+", " ", dd.text.strip())
            if announcement is None:
                print(f"no announcement in {e.prettify()}")

            ageGroups = [ ak.text.strip() for ak in ageGroupsDiv.find_all("span") ]
            # NOTE: deliberately returns after the first event element.
            return wettkampf(title, year, int(date[0][1]), int(date[0][0]), announcement, organizer, place, address, ageGroups)

    @staticmethod
    def from_url(url, year=None):
        '''Fetch *url* and build a wettkampf from the returned HTML.'''
        if year is None:
            year = datetime.date.today().year
        return wettkampf.from_htmlString( requests.get( url ).content, year )
|