Zum Commit vorgemerkte Änderungen:
neue Datei: src/wkScraper/jvsCal.py neue Datei: src/wkScraper/wkScraper-JvsCal.py
This commit is contained in:
165
wkOrg/src/wkScraper/jvsCal.py
Normal file
165
wkOrg/src/wkScraper/jvsCal.py
Normal file
@@ -0,0 +1,165 @@
|
||||
#! /usr/bin/env python3
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import datetime
|
||||
import json
|
||||
import re
|
||||
import requests
|
||||
|
||||
jvsCalUrl = "https://judoverbandsachsen.de/kalender/?show=all"
|
||||
|
||||
def parseJvsCal(url=jvsCalUrl, minYear=0, minMonth=0, onlyWithAks=False):
    '''
    Parse the calendar page of the JVS (Judo-Verband Sachsen).

    Parameters
    ----------
    url : str
        Calendar URL to fetch (defaults to the full JVS calendar).
    minYear : int
        Skip months that belong to years before this one.
    minMonth : int
        Within minYear itself, additionally skip months before this one.
    onlyWithAks : bool
        If True, drop events without age groups ("Altersklassen").

    Returns
    -------
    dict
        Nested mapping cal[year][month] -> list of event dicts with keys
        "time", "title", "aks", "place" and "url".
    '''
    jvsCalPage = requests.get(url)
    jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
    # The calendar groups events in <div class="month" data-month="YYYYMM">.
    jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
    jvsCalEventMonts = jvsCalEventListItems.find_all("div", class_="month")

    jvsWkList = {}

    for m in jvsCalEventMonts:
        if not m.has_attr("data-month"):
            continue
        yearMonth = m.get("data-month")
        year = int(yearMonth[0:4])
        month = int(yearMonth[4:6])
        # Filter out everything before (minYear, minMonth) BEFORE creating
        # dict entries; the original created jvsWkList[year] first, which
        # left empty year dicts behind for fully filtered years.
        if year < minYear or (year == minYear and month < minMonth):
            continue
        monthList = jvsWkList.setdefault(year, {}).setdefault(month, [])
        for event in m.find_all(class_="posts"):
            # Name the event link eventUrl: the original rebound the `url`
            # parameter here.
            time, title, aks, place, eventUrl = parseJvsEvent(event, year)
            if onlyWithAks and aks is None:
                continue
            monthList.append({
                "time": time,
                "title": title,
                "aks": aks,
                "place": place,
                "url": eventUrl,
            })
    return jvsWkList
|
||||
|
||||
def parseJvsEvent(jvsEvent, year):
    '''
    Parse a single event row out of the JVS calendar.

    Parameters
    ----------
    jvsEvent : bs4.Tag
        One event element (a ".posts" row of the calendar listing).
    year : int
        Year the event belongs to; the row itself only shows day and month.

    Returns
    -------
    tuple
        (time, title, aks, place, url). time is a list of "YYYY-MM-DD"
        strings (start and, if present, end date); aks is a list of
        age-group labels or None. Any field that could not be parsed
        is returned as None.
    '''
    # Initialise ALL result fields so a partial parse failure cannot raise
    # UnboundLocalError in the final return; the original only initialised
    # `aks`, so an early exception made the function crash instead of
    # reporting the faulty event.
    time = title = aks = place = url = None
    try:
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                # e.g. "01.02. - 03.02." -> ["2024-02-01", "2024-02-03"]
                time = [ t.strip() for t in re.sub( r"\s+", " ", e.find("time").text.strip() ).split("-")]
                time = [ t.split(".") for t in time ]
                time = [ "-".join( [str(year), t[1], t[0]] ) for t in time ]
            if e.find("span"):
                aks = [ ak.text.strip() for ak in e.find_all("span") ]
                aks = [ ak for ak in aks if ak != "" ]
                if len(aks) == 0:
                    aks = None
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        # Sanity check: the link text should be the whole cell's text.
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert(title==titleFull)
    except Exception as e:
        print(f"Error '{e}' parsing:")
        print(jvsEvent.prettify())

    return time, title, aks, place, url
|
||||
|
||||
def download_file(url):
    '''
    Download `url` into the current directory, streaming in chunks.

    The local file name is taken from the last path component of the URL.
    Returns the local file name.
    '''
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(local_filename, 'wb') as out:
            # If you have chunk encoded response uncomment if
            # and set chunk_size parameter to None.
            #if chunk:
            for piece in response.iter_content(chunk_size=8192):
                out.write(piece)
    return local_filename
|
||||
|
||||
def getWk(urlList):
    '''
    Fetch every URL in `urlList` and parse each page into a wettkampf.

    A single requests.Session is reused for all downloads.
    '''
    session = requests.Session()
    results = []
    for url in urlList:
        results.append(wettkampf.from_htmlString(session.get(url).content))
    return results
|
||||
|
||||
class wettkampf:
    '''
    One competition ("Wettkampf") scraped from the JVS calendar.
    '''

    def __init__(self, title, year, month, day, announcement, organizer, place, address, ageGroups):
        '''
        Store the competition data.

        All text fields are coerced to str; year/month/day are combined
        into a datetime.date. ageGroups is kept as given (a list of
        age-group labels).
        '''
        self.title = str(title)
        self.date = datetime.date(year, month, day)
        self.announcement = str(announcement)
        self.organizer = str(organizer)
        self.place = str(place)
        self.address = str(address)
        self.ageGroups = ageGroups

    def to_dict(self):
        '''
        Return the competition as a plain dict; the date is rendered
        as "YYYY-MM-DD".
        '''
        return {
            'title': self.title,
            'date': self.date.strftime("%Y-%m-%d"),
            'announcement': self.announcement,
            'organizer': self.organizer,
            'place': self.place,
            'address': self.address,
            'ageGroups': self.ageGroups
        }

    def to_json(self):
        '''
        Return the competition serialized as a JSON string.
        '''
        return json.dumps(self.to_dict())

    @staticmethod
    def from_htmlString(wkString, year=None):
        '''
        Create a wettkampf from a html string of the event in the
        calender of the JVS.

        The html string is only the wettkampf specific part in the page
        of the wettkampf — not the whole page and not the whole calender.
        `year` defaults to the current year because the event markup only
        carries day and month.

        Returns the wettkampf built from the first "event-single" element,
        or None when the markup contains none.
        '''
        if year is None:
            year = datetime.date.today().year

        wkSoup = BeautifulSoup(wkString, "html.parser")

        for e in wkSoup.find_all(class_="event-single"):
            title = e.find("header").text.strip()
            articleTag = e.find("article")
            # e.g. "01.02. - 03.02." -> [["01", "02", ""], ...]; only the
            # start date (index 0) is used below.
            date = [ t.strip() for t in re.sub( r"\s+", " ", articleTag.find("time").text.strip() ).split("-")]
            date = [ t.split(".") for t in date ]

            # The article is expected to contain exactly three divs:
            # announcement, place and age groups.
            [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")

            # Pre-initialise every field so a missing <dt> label yields
            # None instead of an UnboundLocalError at the constructor
            # call (the original only initialised `announcement`).
            announcement = organizer = place = address = None
            for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
                label = dt.text.strip()
                if label == "Ausschreibung:":
                    announcement = dd.find("a")['href']
                if label == "Veranstalter:":
                    organizer = dd.text.strip()
                if label == "Veranstaltungsort:":
                    place = dd.text.strip()
                if label == "Veranstaltungsadresse:":
                    address = re.sub(r"\s+", " ", dd.text.strip())
            if announcement is None:
                print(f"no announcement in {e.prettify()}")

            ageGroups = [ ak.text.strip() for ak in ageGroupsDiv.find_all("span") ]
            return wettkampf(title, year, int(date[0][1]), int(date[0][0]), announcement, organizer, place, address, ageGroups)
        # No "event-single" element found: make the implicit None explicit.
        return None

    @staticmethod
    def from_url(url, year=None):
        '''
        Fetch `url` and parse the page with from_htmlString.
        '''
        if year is None:
            year = datetime.date.today().year
        return wettkampf.from_htmlString( requests.get( url ).content, year )
|
||||
Reference in New Issue
Block a user