#! /usr/bin/env python3
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup
import datetime
import json
import re

jvsCalUrl = "https://judoverbandsachsen.de/kalender/?show=all"


def parseJvsCal(url=jvsCalUrl, minYear=0, minMonth=0, onlyWithAks=False):
    '''
    Parse the calendar page of the JVS.
    Returns a dictionary of dictionaries of lists:
    cal[year][month] = list of event dicts (time, title, aks, place, url).
    '''
    jvsCalPage = requests.get(url)
    jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
    jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
    jvsCalEventMonths = jvsCalEventListItems.find_all("div", class_="month")
    jvsWkList = {}
    for m in jvsCalEventMonths:
        if m.has_attr("data-month"):
            # data-month is formatted as YYYYMM
            yearMonth = m.get("data-month")
            year = int(yearMonth[0:4])
            if year < minYear:
                continue
            if year not in jvsWkList:
                jvsWkList[year] = {}
            month = int(yearMonth[4:6])
            if year == minYear and month < minMonth:
                continue
            if month not in jvsWkList[year]:
                jvsWkList[year][month] = []
            events = m.find_all(class_="posts")
            for event in events:
                # eventUrl avoids shadowing the url parameter of this function
                time, title, aks, place, eventUrl = parseJvsEvent(event, year)
                if aks is None and onlyWithAks:
                    continue
                jvsWkList[year][month].append({
                    "time": time,
                    "title": title,
                    "aks": aks,
                    "place": place,
                    "url": eventUrl
                })
    return jvsWkList
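
# A minimal usage sketch (assumes network access to judoverbandsachsen.de and
# the calendar markup parsed above):
#
#   cal = parseJvsCal(minYear=2021, minMonth=10, onlyWithAks=True)
#   for year, months in cal.items():
#       for month, events in months.items():
#           for ev in events:
#               print(year, month, ev["title"], ev["url"])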


def parseJvsEvent(jvsEvent, year):
    '''
    Parse an event out of the jvsCalendar.
    Returns a tuple (time, title, aks, place, url); fields that could not
    be parsed are None.
    '''
    # pre-initialize all fields so the return below cannot raise
    # UnboundLocalError when parsing fails half-way
    time, title, aks, place, url = None, None, None, None, None
    try:
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                # e.g. "17.10. - 18.10." -> ["2021-10-17", "2021-10-18"]
                time = [t.strip() for t in re.sub(r"\s+", " ", e.find("time").text.strip()).split("-")]
                time = [t.split(".") for t in time]
                time = ["-".join([str(year), t[1], t[0]]) for t in time]
            if e.find("span"):
                aks = [ak.text.strip() for ak in e.find_all("span")]
                aks = [ak for ak in aks if ak != ""]
                if len(aks) == 0:
                    aks = None
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert title == titleFull
    except Exception as e:
        print(f"Error '{e}' parsing:")
        print(jvsEvent.prettify())
    return time, title, aks, place, url
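
# Illustrative shape of the returned tuple (hypothetical values):
#   (['2021-10-17', '2021-10-18'], 'Some Title', ['U13', 'U15'],
#    'Some Place', 'https://judoverbandsachsen.de/...')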


def download_file(url):
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                # If the response is chunk-encoded, uncomment the if below
                # and set chunk_size to None.
                # if chunk:
                f.write(chunk)
    return local_filename
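
# Usage sketch (the URL is hypothetical; real announcement links come from
# parseJvsCal / wettkampf.announcement):
#
#   pdf = download_file("https://judoverbandsachsen.de/files/ausschreibung.pdf")
#   print(f"saved to {pdf}")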


def getWk(urlList):
    '''Fetch each event page and parse it into a wettkampf object.'''
    s = requests.Session()
    return [wettkampf.from_htmlString(s.get(url).content) for url in urlList]


class wettkampf:
    def __init__(self, title, year, month, day, announcement, organizer,
                 place, address, ageGroups):
        self.title = str(title)
        self.date = datetime.date(year, month, day)
        self.announcement = str(announcement)
        self.organizer = str(organizer)
        self.place = str(place)
        self.address = str(address)
        self.ageGroups = ageGroups

    def to_dict(self):
        return {
            'title': self.title,
            'date': self.date.strftime("%Y-%m-%d"),
            'announcement': self.announcement,
            'organizer': self.organizer,
            'place': self.place,
            'address': self.address,
            'ageGroups': self.ageGroups
        }

    def to_json(self, indent=2):
        return json.dumps(self.to_dict(), indent=indent)

    @staticmethod
    def from_htmlString(wkString, year=None):
        '''
        Create a wettkampf from an HTML string of the event in the
        calendar of the JVS.
        The HTML string is only the wettkampf-specific part of the
        wettkampf page, not the whole page and not the whole calendar.
        '''
        if year is None:
            year = datetime.date.today().year
        wkSoup = BeautifulSoup(wkString, "html.parser")
        for e in wkSoup.find_all(class_="event-single"):
            title = e.find("header").text.strip()
            articleTag = e.find("article")
            date = [t.strip() for t in re.sub(r"\s+", " ", articleTag.find("time").text.strip()).split("-")]
            date = [t.split(".") for t in date]
            # only ageGroupsDiv is used below
            [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")
            # pre-initialize so missing <dt>/<dd> pairs do not raise
            # UnboundLocalError in the constructor call below
            announcement = organizer = place = address = None
            for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
                if dt.text.strip() == "Ausschreibung:":
                    announcement = dd.find("a")['href']
                if dt.text.strip() == "Veranstalter:":
                    organizer = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsort:":
                    place = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsadresse:":
                    address = re.sub(r"\s+", " ", dd.text.strip())
            if announcement is None:
                print(f"no announcement in {e.prettify()}")
            ageGroups = [ak.text.strip() for ak in ageGroupsDiv.find_all("span")]
            # date[0] is the start date as [day, month, ...]
            return wettkampf(title, year, int(date[0][1]), int(date[0][0]),
                             announcement, organizer, place, address, ageGroups)

    @staticmethod
    def from_url(url, year=None):
        if year is None:
            year = datetime.date.today().year
        return wettkampf.from_htmlString(requests.get(url).content, year)
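

# End-to-end sketch (assumes the calendar and event pages are reachable and
# still use the markup parsed above):
#
#   if __name__ == "__main__":
#       cal = parseJvsCal(onlyWithAks=True)
#       urls = [ev["url"] for months in cal.values()
#               for events in months.values() for ev in events]
#       for wk in getWk(urls[:3]):
#           print(wk.to_json())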