Files
cwsvJudo/wkOrg/src/wkScraper/wkScraper-JvsCal.py
2021-10-17 12:17:54 +02:00

205 lines
5.6 KiB
Python
Executable File

#! /usr/bin/env python3
# -*- coding: UTF-8 -*-
import re
import requests
from bs4 import BeautifulSoup
import datetime
import json
import jvsCal
class shiai:
    """A tournament (shiai) record: title, date and participating age groups."""

    def __init__(self, title, date, ageGroups):
        # BUG FIX: the original parameter was misspelled `titel` while the
        # body assigned `self.title = title`, raising NameError on every
        # instantiation.
        self.title = title
        self.date = date
        self.ageGroups = ageGroups

    def toDict(self):
        """Return the shiai as a plain, JSON-serialisable dict."""
        return {
            'title': self.title,
            'date': self.date,
            'ageGroups': self.ageGroups,
        }

    def toJson(self):
        """Return the shiai serialised as a JSON string."""
        return json.dumps(self.toDict())
class event:
    """A calendar event.

    An event consists of:
    - a timespan: a start date and an optional end date (ISO strings),
    - a title,
    - an optional place,
    - an optional URL.
    """

    def __init__(self, timespan, title, place=None, url=None):
        self.date = datetime.date.fromisoformat(timespan[0])
        # Single-day events carry no end date.
        self.endDate = (
            datetime.date.fromisoformat(timespan[1]) if len(timespan) >= 2 else None
        )
        self.title = title
        self.place = place
        self.url = url

    def toDict(self):
        """Serialise to a dict; optional fields are omitted when unset."""
        result = {"date": str(self.date)}
        if self.endDate is not None:
            result["endDate"] = str(self.endDate)
        result["title"] = self.title
        if self.place is not None:
            result["place"] = self.place
        if self.url is not None:
            result["url"] = self.url
        return result

    def toJson(self):
        """Serialise to a JSON string."""
        return json.dumps(self.toDict())
class wk(event):
    """An `event` that additionally carries a list of age groups (akList)."""

    def __init__(self, timespan, title, akList, place=None, url=None):
        super().__init__(timespan, title, place, url)
        self.akList = akList

    def toDict(self):
        """Serialise like `event.toDict`, with the age-group list appended."""
        return {**super().toDict(), "akList": self.akList}
def parseJvsEvent(jvsEvent, onlyIfWithAk=False, eventYear=None):
    """Parse one JVS calendar list item into a `wk` or `event`.

    Parameters:
        jvsEvent: BeautifulSoup tag of one calendar row (columns col-2..col-4).
        onlyIfWithAk: if True, return None for entries without age groups.
        eventYear: year string used to complete the "DD.MM." dates; when None,
            the module-level `year` (set by the calling loop in __main__) is
            used, preserving the original behaviour.

    Returns:
        A `wk` when age groups were found, otherwise an `event`
        (or None when `onlyIfWithAk` is set), or None on a parse error.
    """
    if eventYear is None:
        eventYear = year  # module-level variable set by the caller's loop
    try:
        aks = None
        time = None
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                # e.g. "17.10. - 18.10." -> ["2021-10-17", "2021-10-18"]
                time = [t.strip() for t in re.sub(r"\s+", " ", e.find("time").text.strip()).split("-")]
                time = [t.split(".") for t in time]
                time = ["-".join([eventYear, t[1], t[0]]) for t in time]
            if e.find("span"):
                aks = [ak.text.strip() for ak in e.find_all("span")]
                aks = [ak for ak in aks if ak != ""]
                if len(aks) == 0:
                    aks = None
        # BUG FIX: the original accessed the *class* `event` here instead of
        # the `jvsEvent` parameter; the resulting AttributeError was swallowed
        # by a bare except and execution then crashed on undefined locals.
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert title == titleFull
    except Exception:
        # Report the unparsable entry and skip it instead of falling through
        # to code that references undefined locals.
        print("Error parsing:")
        print(jvsEvent.prettify())
        return None
    if aks is not None:
        return wk(time, title, aks, place, url)
    if onlyIfWithAk:
        return None
    return event(time, title, place, url)
# NOTE(review): this __main__ section is staged exploratory/debug code.  Only
# the first stage up to `exit(-1)` ever runs; every later stage is dead code
# kept from earlier experiments, each terminated by its own `exit()`.
if __name__=="__main__":
    # Stage 1: fetch the parsed calendar via the jvsCal helper module and dump
    # the very first event, then quit.
    events = jvsCal.parseJvsCal(minYear=datetime.date.today().year, minMonth=datetime.date.today().month, onlyWithAks=True)
    # print(f"{json.dumps(events, indent=2)}")
    for year in events:
        for month in events[year]:
            for event in events[year][month]:
                # NOTE(review): `event` and `wk` here shadow the classes of
                # the same names defined above in this file.
                print(f"{event}")
                wk = jvsCal.wettkampf.from_url( event['url'] )
                print(f"{wk.to_json()}")
                exit(-1)  # debugging aid: stop after the first event
    # --- dead code from here on (unreachable once the loop exits above) ---
    print( [ e['url'] for y in events for m in events[y] for e in events[y][m]] )
    print( jvsCal.getWk([ e['url'] for y in events for m in events[y] for e in events[y][m]]) )
    exit()
    # Stage 2 (dead): parse a single known event page via jvsCal.
    jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"
    testWk = jvsCal.wettkampf.from_url( jvsCalShiaiUrl )
    print(testWk.to_json())
    exit()
    #with open("rkp.html", "w") as f:
    #    f.write(BeautifulSoup(requests.get("https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/").content, "html.parser").prettify())
    #exit()
    # Stage 3 (dead): hand-rolled parsing of a locally saved event page.
    url = "https://judoverbandsachsen.de/kalender/?show=all"
    jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"
    #jvsCalShiaiUrl = "file://rkp.html"
    #jvsCalShiaiPage = requests.get(jvsCalShiaiUrl)
    #jvsCalShiaiSoup = BeautifulSoup(jvsCalShiaiPage.content, "html.parser")
    jvsCalShiaiSoup = BeautifulSoup(open("rkp.html"), "html.parser")
    year = "2020"
    for e in jvsCalShiaiSoup.find_all(class_="event-single"):
        print(e.prettify())
        title = e.find("header").text.strip()
        articleTag= e.find("article")
        # "DD.MM. - DD.MM." plus the year -> list of "YYYY-MM-DD" strings
        date = [ t.strip() for t in re.sub( "\s+", " ", articleTag.find("time").text.strip() ).split("-")]
        date = [ t.split(".") for t in date ]
        date = [ "-".join( [year, t[1], t[0]] ) for t in date ]
        [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")
        announcement = {}
        place = {}
        # The <dt>/<dd> pairs carry the announcement URL, the organizer, and
        # the venue name/address (German labels from the website).
        for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
            if dt.text.strip() == "Ausschreibung:":
                announcement['url'] = dd.find("a")['href']
            if dt.text.strip() == "Veranstalter:":
                announcement['organizer'] = dd.text.strip()
            if dt.text.strip() == "Veranstaltungsort:":
                place['name'] = dd.text.strip()
            if dt.text.strip() == "Veranstaltungsadresse:":
                place['address'] = re.sub("\s+", " ", dd.text.strip())
            # print(dt.text, dd.text)
        ageGroups = [ ak.text.strip() for ak in ageGroupsDiv.find_all("span") ]
        print(f"title: {title})")
        print(f"date: {date})")
        print(f"announcement: {announcement}")
        print(f"place: {place}")
        print(f"ageGroups: {ageGroups}")
    exit()
    # Stage 4 (dead): parse the full calendar list with parseJvsEvent and
    # print all found Wettkaempfe as JSON.
    jvsCalPage = requests.get(url)
    jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
    #jvsCalEventList = jvsCalSoup.find(id="eventList")
    #print(jvsCalEventList.prettify())
    jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
    #print(jvsCalEventListItems.prettify())
    jvsCalEventMonts = jvsCalEventListItems.find_all("div", class_="month")
    jvsWkList = []
    for m in jvsCalEventMonts:
        if m.has_attr("data-month"):
            # "data-month" is "YYYYMM"; this also sets the module-level `year`
            # that parseJvsEvent reads.
            yearMonth = m.get("data-month")
            year = (yearMonth[0:4])
            month = yearMonth[4:6]
            print(f"Jahr: {year}, Monat: {month}")
            events = m.find_all(class_="posts")
            for event in events:
                parsedEvent = parseJvsEvent(event, onlyIfWithAk=True)
                if parsedEvent is not None:
                    jvsWkList.append(parsedEvent)
        else:
            print("no data-month")
    for w in jvsWkList:
        print(w.toJson())