Changes staged for commit:

new file:     src/wkScraper/jvsCal.py
new file:     src/wkScraper/wkScraper-JvsCal.py
This commit is contained in:
marko
2020-10-11 09:40:27 +02:00
parent 4b1c208bfe
commit 7486a9e886
2 changed files with 362 additions and 0 deletions

View File

@@ -0,0 +1,165 @@
#! /usr/bin/env python3
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup
import datetime
import json
import re
import requests
jvsCalUrl = "https://judoverbandsachsen.de/kalender/?show=all"
def parseJvsCal(url=jvsCalUrl, minYear=0, minMonth=0, onlyWithAks=False):
    '''
    Parse the calendar page of the JVS (Judoverband Sachsen).

    url         -- calendar URL to fetch (defaults to the full JVS calendar)
    minYear     -- skip months of years before this one
    minMonth    -- within minYear, skip months before this one
    onlyWithAks -- drop events that carry no age-group (AK) annotation

    Returns a dictionary of dictionaries of lists:
    cal[year][month] = list of event dicts with the keys
    "time", "title", "aks", "place" and "url".
    '''
    jvsCalPage = requests.get(url)
    jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
    jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
    jvsCalEventMonths = jvsCalEventListItems.find_all("div", class_="month")
    jvsWkList = {}
    for m in jvsCalEventMonths:
        # month containers without a data-month attribute carry no events
        if not m.has_attr("data-month"):
            continue
        yearMonth = m.get("data-month")  # format: YYYYMM
        year = int(yearMonth[0:4])
        if year < minYear:
            continue
        if year not in jvsWkList:
            jvsWkList[year] = {}
        month = int(yearMonth[4:6])
        if year == minYear and month < minMonth:
            continue
        if month not in jvsWkList[year]:
            jvsWkList[year][month] = []
        for event in m.find_all(class_="posts"):
            # "eventUrl" avoids shadowing the "url" parameter of this function
            time, title, aks, place, eventUrl = parseJvsEvent(event, year)
            if aks is None and onlyWithAks:
                continue
            jvsWkList[year][month].append({
                "time": time,
                "title": title,
                "aks": aks,
                "place": place,
                "url": eventUrl
            })
    return jvsWkList
def parseJvsEvent(jvsEvent, year):
    '''
    Parse a single event row of the JVS calendar.

    jvsEvent -- BeautifulSoup tag of one event row (class "posts")
    year     -- year the event belongs to (the row only shows day and month)

    Returns the tuple (time, title, aks, place, url):
      time  -- list of ISO date strings (start[, end]) or None
      title -- event title or None
      aks   -- list of age-group strings, or None when absent
      place -- venue string, or None when empty
      url   -- link to the event page or None
    On a parse error the problem is printed and the not-yet-parsed
    elements of the tuple are returned as None.
    '''
    # pre-initialize so a parse error cannot cause an UnboundLocalError
    # at the return statement below
    time = title = aks = place = url = None
    try:
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                # e.g. "11.10. - 12.10." -> ["2020-10-11", "2020-10-12"]
                time = [t.strip() for t in re.sub(r"\s+", " ", e.find("time").text.strip()).split("-")]
                time = [t.split(".") for t in time]
                time = ["-".join([str(year), t[1], t[0]]) for t in time]
            if e.find("span"):
                aks = [ak.text.strip() for ak in e.find_all("span")]
                aks = [ak for ak in aks if ak != ""]
                if len(aks) == 0:
                    aks = None
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        # the anchor text is expected to be the whole column text
        assert title == titleFull
    except Exception as e:
        print(f"Error '{e}' parsing:")
        print(jvsEvent.prettify())
    return time, title, aks, place, url
def download_file(url):
    '''
    Download *url* into the current directory, streaming the body in
    8 KiB chunks, and return the name of the written file (the last
    path segment of the URL).
    '''
    local_filename = url.split('/')[-1]
    # stream=True keeps the response body out of memory until iterated
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(local_filename, 'wb') as out:
            # chunk-encoded responses may yield empty keep-alive chunks;
            # writing those is harmless, so they are not filtered out
            for block in response.iter_content(chunk_size=8192):
                out.write(block)
    return local_filename
def getWk(urlList):
    '''
    Fetch every URL in urlList over one shared HTTP session and parse
    each response body into a wettkampf instance; return the list.
    '''
    session = requests.Session()
    result = []
    for wkUrl in urlList:
        result.append(wettkampf.from_htmlString(session.get(wkUrl).content))
    return result
class wettkampf:
    '''
    A single competition (German: Wettkampf) scraped from a JVS event
    page.
    '''

    def __init__(self, title, year, month, day, announcement, organizer, place, address, ageGroups):
        '''
        title          -- event title
        year/month/day -- (start) date of the event
        announcement   -- URL of the announcement document
        organizer      -- organizing club / association
        place          -- venue name
        address        -- venue address
        ageGroups      -- list of age-group strings
        '''
        self.title = str(title)
        self.date = datetime.date(year, month, day)
        self.announcement = str(announcement)
        self.organizer = str(organizer)
        self.place = str(place)
        self.address = str(address)
        self.ageGroups = ageGroups

    def to_dict(self):
        '''Return the competition as a plain, JSON-serializable dict.'''
        return {
            'title': self.title,
            'date': self.date.strftime("%Y-%m-%d"),
            'announcement': self.announcement,
            'organizer': self.organizer,
            'place': self.place,
            'address': self.address,
            'ageGroups': self.ageGroups
        }

    def to_json(self):
        '''Return the competition serialized as a JSON string.'''
        return json.dumps(self.to_dict())

    @staticmethod
    def from_htmlString(wkString, year=None):
        '''
        Create a wettkampf from a html string of the event in the
        calendar of the JVS.

        The html string is only the wettkampf-specific part of the
        page of the wettkampf, not the whole page and not the whole
        calendar.  year defaults to the current year because the page
        shows only day and month.  Returns the first parsed event, or
        None when no "event-single" element is found.
        '''
        if year is None:
            year = datetime.date.today().year
        wkSoup = BeautifulSoup(wkString, "html.parser")
        for e in wkSoup.find_all(class_="event-single"):
            title = e.find("header").text.strip()
            articleTag = e.find("article")
            # e.g. "11.10. - 12.10." -> [["11", "10", ""], ["12", "10", ""]]
            date = [t.strip() for t in re.sub(r"\s+", " ", articleTag.find("time").text.strip()).split("-")]
            date = [t.split(".") for t in date]
            [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")
            # pre-initialize: a missing <dt> label no longer raises a
            # NameError when building the wettkampf below
            announcement = organizer = place = address = None
            for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
                if dt.text.strip() == "Ausschreibung:":
                    announcement = dd.find("a")['href']
                if dt.text.strip() == "Veranstalter:":
                    organizer = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsort:":
                    place = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsadresse:":
                    address = re.sub(r"\s+", " ", dd.text.strip())
            if announcement is None:
                print(f"no announcement in {e.prettify()}")
            ageGroups = [ak.text.strip() for ak in ageGroupsDiv.find_all("span")]
            # date[0] is the start date in [day, month, ...] order (DD.MM.)
            return wettkampf(title, year, int(date[0][1]), int(date[0][0]), announcement, organizer, place, address, ageGroups)

    @staticmethod
    def from_url(url, year=None):
        '''Fetch *url* and parse its body with from_htmlString.'''
        if year is None:
            year = datetime.date.today().year
        return wettkampf.from_htmlString(requests.get(url).content, year)

View File

@@ -0,0 +1,197 @@
#! /usr/bin/env python3
# -*- coding: UTF-8 -*-
import re
import requests
from bs4 import BeautifulSoup
import datetime
import json
import jvsCal
class shiai:
    '''
    Minimal record of a competition (Japanese: shiai): title, date and
    the admitted age groups.
    '''

    def __init__(self, titel, date, ageGroups):
        # the parameter is spelled "titel" (German); kept for interface
        # compatibility.  The original body assigned the undefined name
        # "title", which raised a NameError on every instantiation.
        self.title = titel
        self.date = date
        self.ageGroups = ageGroups

    def toDict(self):
        '''Return the record as a plain dict.'''
        return {
            'title': self.title,
            'date': self.date,
            'ageGroups': self.ageGroups,
        }

    def toJson(self):
        '''Return the record serialized as a JSON string.'''
        return json.dumps(self.toDict())
class event:
    '''
    One calendar entry, consisting of:
      - a start date (plus an optional end date)
      - a title (name)
      - optionally a place
      - optionally a URL
    '''

    def __init__(self, timespan, title, place=None, url=None):
        # timespan is a list of ISO date strings: [start] or [start, end]
        self.date = datetime.date.fromisoformat(timespan[0])
        self.endDate = datetime.date.fromisoformat(timespan[1]) if len(timespan) > 1 else None
        self.title = title
        self.place = place
        self.url = url

    def toDict(self):
        '''Return the entry as a dict; unset optional fields are omitted.'''
        entry = {"date": str(self.date)}
        if self.endDate is not None:
            entry["endDate"] = str(self.endDate)
        entry["title"] = self.title
        for optional in ("place", "url"):
            value = getattr(self, optional)
            if value is not None:
                entry[optional] = value
        return entry

    def toJson(self):
        '''Return the entry serialized as a JSON string.'''
        return json.dumps(self.toDict())
class wk(event):
    '''A calendar entry that additionally carries its age groups (AKs).'''

    def __init__(self, timespan, title, akList, place=None, url=None):
        super().__init__(timespan, title, place=place, url=url)
        self.akList = akList

    def toDict(self):
        '''Extend the base entry dict with the age-group list.'''
        return dict(super().toDict(), akList=self.akList)
def parseJvsEvent(jvsEvent, onlyIfWithAk=False, year=None):
    '''
    Parse one event row (class "posts") of the JVS calendar.

    jvsEvent     -- BeautifulSoup tag of the row
    onlyIfWithAk -- return None for events without age-group spans
    year         -- year string used to build the ISO dates; defaults to
                    the current year (the row shows only day and month)

    Returns a wk when age groups are present, otherwise an event, or
    None when onlyIfWithAk filters the row or parsing fails.
    '''
    if year is None:
        # the original relied on a module-level "year" global; take an
        # explicit parameter instead and default to the current year
        year = str(datetime.date.today().year)
    # pre-initialize so a parse failure cannot hit unbound locals below
    time = title = place = url = aks = None
    try:
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                # e.g. "11.10. - 12.10." -> ["2020-10-11", "2020-10-12"]
                time = [t.strip() for t in re.sub(r"\s+", " ", e.find("time").text.strip()).split("-")]
                time = [t.split(".") for t in time]
                time = ["-".join([year, t[1], t[0]]) for t in time]
            if e.find("span"):
                aks = [ak.text.strip() for ak in e.find_all("span")]
                aks = [ak for ak in aks if ak != ""]
                if len(aks) == 0:
                    aks = None
        # the original looked these up on the class "event" instead of the
        # jvsEvent argument, which raised AttributeError on every call
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert title == titleFull
    except Exception as err:
        print(f"Error '{err}' parsing:")
        print(jvsEvent.prettify())
        return None
    if aks is not None:
        return wk(time, title, aks, place, url)
    if onlyIfWithAk:
        return None
    return event(time, title, place, url)
# --- ad-hoc test driver -----------------------------------------------------
# NOTE(review): everything below is exploratory scratch code.  The first
# exit() stops execution, so only the jvsCal-based scraping directly below
# ever runs; the later sections are earlier experiments kept for reference.

# Scrape all events of the current year from this month on that carry
# age-group annotations, then fetch and parse each event page.
events = jvsCal.parseJvsCal(minYear=datetime.date.today().year, minMonth=datetime.date.today().month, onlyWithAks=True)
#events = jvsCal.parseJvsCal()
#print( jvsCal.parseJvsCal(minYear=datetime.date.today().year, minMonth=datetime.date.today().month, onlyWithAks=True) )
# flatten the year -> month -> event structure into a flat list of URLs
print( [ e['url'] for y in events for m in events[y] for e in events[y][m]] )
print( jvsCal.getWk([ e['url'] for y in events for m in events[y] for e in events[y][m]]) )
exit()

# --- unreachable: parse a single event page via jvsCal.wettkampf ------------
jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"
testWk = jvsCal.wettkampf.from_url( jvsCalShiaiUrl )
print(testWk.to_json())
exit()

#with open("rkp.html", "w") as f:
# f.write(BeautifulSoup(requests.get("https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/").content, "html.parser").prettify())
#exit()

# --- unreachable: manual parse of a locally saved event page (rkp.html) -----
url = "https://judoverbandsachsen.de/kalender/?show=all"
jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"
#jvsCalShiaiUrl = "file://rkp.html"
#jvsCalShiaiPage = requests.get(jvsCalShiaiUrl)
#jvsCalShiaiSoup = BeautifulSoup(jvsCalShiaiPage.content, "html.parser")
jvsCalShiaiSoup = BeautifulSoup(open("rkp.html"), "html.parser")
year = "2020"
for e in jvsCalShiaiSoup.find_all(class_="event-single"):
    print(e.prettify())
    title = e.find("header").text.strip()
    articleTag= e.find("article")
    # "11.10. - 12.10." (German DD.MM.) -> ["2020-10-11", "2020-10-12"]
    date = [ t.strip() for t in re.sub( "\s+", " ", articleTag.find("time").text.strip() ).split("-")]
    date = [ t.split(".") for t in date ]
    date = [ "-".join( [year, t[1], t[0]] ) for t in date ]
    [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")
    announcement = {}
    place = {}
    # map the German <dt> labels of the definition list onto dict fields
    for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
        if dt.text.strip() == "Ausschreibung:":
            announcement['url'] = dd.find("a")['href']
        if dt.text.strip() == "Veranstalter:":
            announcement['organizer'] = dd.text.strip()
        if dt.text.strip() == "Veranstaltungsort:":
            place['name'] = dd.text.strip()
        if dt.text.strip() == "Veranstaltungsadresse:":
            place['address'] = re.sub("\s+", " ", dd.text.strip())
        # print(dt.text, dd.text)
    ageGroups = [ ak.text.strip() for ak in ageGroupsDiv.find_all("span") ]
    print(f"title: {title})")
    print(f"date: {date})")
    print(f"announcement: {announcement}")
    print(f"place: {place}")
    print(f"ageGroups: {ageGroups}")
exit()

# --- unreachable: inline calendar scrape (superseded by jvsCal.parseJvsCal) -
jvsCalPage = requests.get(url)
jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
#jvsCalEventList = jvsCalSoup.find(id="eventList")
#print(jvsCalEventList.prettify())
jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
#print(jvsCalEventListItems.prettify())
jvsCalEventMonts = jvsCalEventListItems.find_all("div", class_="month")
jvsWkList = []
for m in jvsCalEventMonts:
    if m.has_attr("data-month"):
        yearMonth = m.get("data-month")  # format: YYYYMM
        year = (yearMonth[0:4])
        month = yearMonth[4:6]
        print(f"Jahr: {year}, Monat: {month}")
        events = m.find_all(class_="posts")
        for event in events:
            parsedEvent = parseJvsEvent(event, onlyIfWithAk=True)
            if parsedEvent is not None:
                jvsWkList.append(parsedEvent)
    else:
        print("no data-month")
for w in jvsWkList:
    print(w.toJson())