Zum Commit vorgemerkte Änderungen:
neue Datei: src/wkScraper/jvsCal.py neue Datei: src/wkScraper/wkScraper-JvsCal.py
This commit is contained in:
165
wkOrg/src/wkScraper/jvsCal.py
Normal file
165
wkOrg/src/wkScraper/jvsCal.py
Normal file
@@ -0,0 +1,165 @@
|
||||
#! /usr/bin/env python3
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import datetime
|
||||
import json
|
||||
import re
|
||||
import requests
|
||||
|
||||
jvsCalUrl = "https://judoverbandsachsen.de/kalender/?show=all"
|
||||
|
||||
def parseJvsCal(url=jvsCalUrl, minYear=0, minMonth=0, onlyWithAks=False):
    '''
    Parse the calendar page of the JVS (Judo-Verband Sachsen).

    Parameters
    ----------
    url : str
        Calendar URL to fetch (defaults to the full JVS calendar).
    minYear : int
        Skip months that belong to years before this one.
    minMonth : int
        Within minYear itself, additionally skip months before this one.
    onlyWithAks : bool
        If True, drop events without age groups ("Altersklassen").

    Returns
    -------
    dict
        Nested mapping cal[year][month] -> list of event dicts with keys
        "time", "title", "aks", "place" and "url".
    '''
    jvsCalPage = requests.get(url)
    jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
    # The calendar groups events in <div class="month" data-month="YYYYMM">.
    jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
    jvsCalEventMonts = jvsCalEventListItems.find_all("div", class_="month")

    jvsWkList = {}

    for m in jvsCalEventMonts:
        if not m.has_attr("data-month"):
            continue
        yearMonth = m.get("data-month")
        year = int(yearMonth[0:4])
        month = int(yearMonth[4:6])
        # Filter out everything before (minYear, minMonth) BEFORE creating
        # dict entries; the original created jvsWkList[year] first, which
        # left empty year dicts behind for fully filtered years.
        if year < minYear or (year == minYear and month < minMonth):
            continue
        monthList = jvsWkList.setdefault(year, {}).setdefault(month, [])
        for event in m.find_all(class_="posts"):
            # Name the event link eventUrl: the original rebound the `url`
            # parameter here.
            time, title, aks, place, eventUrl = parseJvsEvent(event, year)
            if onlyWithAks and aks is None:
                continue
            monthList.append({
                "time": time,
                "title": title,
                "aks": aks,
                "place": place,
                "url": eventUrl,
            })
    return jvsWkList
|
||||
|
||||
def parseJvsEvent(jvsEvent, year):
    '''
    Parse a single event row out of the JVS calendar.

    Parameters
    ----------
    jvsEvent : bs4.Tag
        One event element (a ".posts" row of the calendar listing).
    year : int
        Year the event belongs to; the row itself only shows day and month.

    Returns
    -------
    tuple
        (time, title, aks, place, url). time is a list of "YYYY-MM-DD"
        strings (start and, if present, end date); aks is a list of
        age-group labels or None. Any field that could not be parsed
        is returned as None.
    '''
    # Initialise ALL result fields so a partial parse failure cannot raise
    # UnboundLocalError in the final return; the original only initialised
    # `aks`, so an early exception made the function crash instead of
    # reporting the faulty event.
    time = title = aks = place = url = None
    try:
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                # e.g. "01.02. - 03.02." -> ["2024-02-01", "2024-02-03"]
                time = [ t.strip() for t in re.sub( r"\s+", " ", e.find("time").text.strip() ).split("-")]
                time = [ t.split(".") for t in time ]
                time = [ "-".join( [str(year), t[1], t[0]] ) for t in time ]
            if e.find("span"):
                aks = [ ak.text.strip() for ak in e.find_all("span") ]
                aks = [ ak for ak in aks if ak != "" ]
                if len(aks) == 0:
                    aks = None
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        # Sanity check: the link text should be the whole cell's text.
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert(title==titleFull)
    except Exception as e:
        print(f"Error '{e}' parsing:")
        print(jvsEvent.prettify())

    return time, title, aks, place, url
|
||||
|
||||
def download_file(url):
    '''
    Download `url` into the current directory, streaming in chunks.

    The local file name is taken from the last path component of the URL.
    Returns the local file name.
    '''
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(local_filename, 'wb') as out:
            # If you have chunk encoded response uncomment if
            # and set chunk_size parameter to None.
            #if chunk:
            for piece in response.iter_content(chunk_size=8192):
                out.write(piece)
    return local_filename
|
||||
|
||||
def getWk(urlList):
    '''
    Fetch every URL in `urlList` and parse each page into a wettkampf.

    A single requests.Session is reused for all downloads.
    '''
    session = requests.Session()
    results = []
    for url in urlList:
        results.append(wettkampf.from_htmlString(session.get(url).content))
    return results
|
||||
|
||||
class wettkampf:
    '''
    One competition ("Wettkampf") scraped from the JVS calendar.
    '''

    def __init__(self, title, year, month, day, announcement, organizer, place, address, ageGroups):
        '''
        Store the competition data.

        All text fields are coerced to str; year/month/day are combined
        into a datetime.date. ageGroups is kept as given (a list of
        age-group labels).
        '''
        self.title = str(title)
        self.date = datetime.date(year, month, day)
        self.announcement = str(announcement)
        self.organizer = str(organizer)
        self.place = str(place)
        self.address = str(address)
        self.ageGroups = ageGroups

    def to_dict(self):
        '''
        Return the competition as a plain dict; the date is rendered
        as "YYYY-MM-DD".
        '''
        return {
            'title': self.title,
            'date': self.date.strftime("%Y-%m-%d"),
            'announcement': self.announcement,
            'organizer': self.organizer,
            'place': self.place,
            'address': self.address,
            'ageGroups': self.ageGroups
        }

    def to_json(self):
        '''
        Return the competition serialized as a JSON string.
        '''
        return json.dumps(self.to_dict())

    @staticmethod
    def from_htmlString(wkString, year=None):
        '''
        Create a wettkampf from a html string of the event in the
        calender of the JVS.

        The html string is only the wettkampf specific part in the page
        of the wettkampf — not the whole page and not the whole calender.
        `year` defaults to the current year because the event markup only
        carries day and month.

        Returns the wettkampf built from the first "event-single" element,
        or None when the markup contains none.
        '''
        if year is None:
            year = datetime.date.today().year

        wkSoup = BeautifulSoup(wkString, "html.parser")

        for e in wkSoup.find_all(class_="event-single"):
            title = e.find("header").text.strip()
            articleTag = e.find("article")
            # e.g. "01.02. - 03.02." -> [["01", "02", ""], ...]; only the
            # start date (index 0) is used below.
            date = [ t.strip() for t in re.sub( r"\s+", " ", articleTag.find("time").text.strip() ).split("-")]
            date = [ t.split(".") for t in date ]

            # The article is expected to contain exactly three divs:
            # announcement, place and age groups.
            [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")

            # Pre-initialise every field so a missing <dt> label yields
            # None instead of an UnboundLocalError at the constructor
            # call (the original only initialised `announcement`).
            announcement = organizer = place = address = None
            for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
                label = dt.text.strip()
                if label == "Ausschreibung:":
                    announcement = dd.find("a")['href']
                if label == "Veranstalter:":
                    organizer = dd.text.strip()
                if label == "Veranstaltungsort:":
                    place = dd.text.strip()
                if label == "Veranstaltungsadresse:":
                    address = re.sub(r"\s+", " ", dd.text.strip())
            if announcement is None:
                print(f"no announcement in {e.prettify()}")

            ageGroups = [ ak.text.strip() for ak in ageGroupsDiv.find_all("span") ]
            return wettkampf(title, year, int(date[0][1]), int(date[0][0]), announcement, organizer, place, address, ageGroups)
        # No "event-single" element found: make the implicit None explicit.
        return None

    @staticmethod
    def from_url(url, year=None):
        '''
        Fetch `url` and parse the page with from_htmlString.
        '''
        if year is None:
            year = datetime.date.today().year
        return wettkampf.from_htmlString( requests.get( url ).content, year )
|
||||
Reference in New Issue
Block a user