Changes staged for commit:
new file: src/wkScraper/jvsCal.py
new file: src/wkScraper/wkScraper-JvsCal.py
wkOrg/src/wkScraper/jvsCal.py · 165 lines · Normal file
@@ -0,0 +1,165 @@
#! /usr/bin/env python3
# -*- coding: UTF-8 -*-

from bs4 import BeautifulSoup
import datetime
import json
import re
import requests


jvsCalUrl = "https://judoverbandsachsen.de/kalender/?show=all"

def parseJvsCal(url=jvsCalUrl, minYear=0, minMonth=0, onlyWithAks=False):
    '''
    Parse the calendar page of the JVS.

    Returns a dictionary of dictionaries of lists: cal[year][month] is a list
    of event dicts with the keys "time", "title", "aks", "place" and "url".
    '''
    jvsCalPage = requests.get(url)
    jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")
    jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
    jvsCalEventMonths = jvsCalEventListItems.find_all("div", class_="month")

    jvsWkList = {}

    for m in jvsCalEventMonths:
        if m.has_attr("data-month"):
            yearMonth = m.get("data-month")
            year = int(yearMonth[0:4])
            if year < minYear:
                continue
            if year not in jvsWkList:
                jvsWkList[year] = {}
            month = int(yearMonth[4:6])
            if year == minYear and month < minMonth:
                continue
            if month not in jvsWkList[year]:
                jvsWkList[year][month] = []
            events = m.find_all(class_="posts")
            for event in events:
                # url = event.find(class_="col-4").find("a")['href']
                time, title, aks, place, url = parseJvsEvent(event, year)
                if aks is None and onlyWithAks:
                    continue
                jvsWkList[year][month].append({
                    "time": time,
                    "title": title,
                    "aks": aks,
                    "place": place,
                    "url": url
                })
    return jvsWkList

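# Usage sketch (illustrative only, not part of the module's API): iterating the
# nested dict that parseJvsCal() returns. The keys and event fields below just
# mirror the structure built above.
#
#   cal = parseJvsCal(minYear=2020, onlyWithAks=True)
#   for year in cal:
#       for month in cal[year]:
#           for ev in cal[year][month]:
#               print(ev["time"], ev["title"], ev["url"])
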
def parseJvsEvent(jvsEvent, year):
    '''
    Parse a single event out of the JVS calendar.

    Returns the tuple (time, title, aks, place, url).
    '''
    time = title = aks = place = url = None
    try:
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                time = [ t.strip() for t in re.sub( r"\s+", " ", e.find("time").text.strip() ).split("-") ]
                time = [ t.split(".") for t in time ]
                time = [ "-".join( [str(year), t[1], t[0]] ) for t in time ]
            if e.find("span"):
                aks = [ ak.text.strip() for ak in e.find_all("span") ]
                aks = [ ak for ak in aks if ak != "" ]
                if len(aks) == 0:
                    aks = None
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert title == titleFull
    except Exception as e:
        print(f"Error '{e}' parsing:")
        print(jvsEvent.prettify())

    return time, title, aks, place, url

def download_file(url):
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                # If the response is chunk-encoded, uncomment the "if" below
                # and set chunk_size to None.
                #if chunk:
                f.write(chunk)
    return local_filename

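# Usage sketch (announcementUrl is a placeholder for an "Ausschreibung" link
# taken from a parsed event, not something this module defines). The file is
# saved under its URL basename in the current working directory:
#
#   localPath = download_file(announcementUrl)
#   print(f"saved announcement to {localPath}")
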
def getWk(urlList):
    s = requests.Session()
    return [ wettkampf.from_htmlString(s.get(url).content) for url in urlList ]


class wettkampf:
    def __init__(self, title, year, month, day, announcement, organizer, place, address, ageGroups):
        self.title = str(title)
        self.date = datetime.date(year, month, day)
        self.announcement = str(announcement)
        self.organizer = str(organizer)
        self.place = str(place)
        self.address = str(address)
        self.ageGroups = ageGroups
        return

    def to_dict(self):
        return {
            'title': self.title,
            'date': self.date.strftime("%Y-%m-%d"),
            'announcement': self.announcement,
            'organizer': self.organizer,
            'place': self.place,
            'address': self.address,
            'ageGroups': self.ageGroups
        }

    def to_json(self):
        return json.dumps(self.to_dict())

    @staticmethod
    def from_htmlString(wkString, year=None):
        '''
        Create a wettkampf from an HTML string of the event in the
        calendar of the JVS.

        The HTML string is only the wettkampf-specific part of the event
        page, not the whole page and not the whole calendar.
        '''
        if year is None:
            year = datetime.date.today().year

        wkSoup = BeautifulSoup(wkString, "html.parser")

        for e in wkSoup.find_all(class_="event-single"):
            title = e.find("header").text.strip()
            articleTag = e.find("article")
            date = [ t.strip() for t in re.sub( r"\s+", " ", articleTag.find("time").text.strip() ).split("-") ]
            date = [ t.split(".") for t in date ]

            [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")

            announcement = organizer = place = address = None
            for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
                if dt.text.strip() == "Ausschreibung:":
                    announcement = dd.find("a")['href']
                if dt.text.strip() == "Veranstalter:":
                    organizer = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsort:":
                    place = dd.text.strip()
                if dt.text.strip() == "Veranstaltungsadresse:":
                    address = re.sub(r"\s+", " ", dd.text.strip())
            if announcement is None:
                print(f"no announcement in {e.prettify()}")

            ageGroups = [ ak.text.strip() for ak in ageGroupsDiv.find_all("span") ]
            return wettkampf(title, year, int(date[0][1]), int(date[0][0]), announcement, organizer, place, address, ageGroups)

    @staticmethod
    def from_url(url, year=None):
        if year is None:
            year = datetime.date.today().year
        return wettkampf.from_htmlString( requests.get( url ).content, year )
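A minimal end-to-end sketch of how the module above is meant to be used (the event URL is the Räucherkerzenpokal page that the test script below also uses; whether that page is still online is an assumption):

    import jvsCal

    cal = jvsCal.parseJvsCal(onlyWithAks=True)
    urls = [ e["url"] for y in cal for m in cal[y] for e in cal[y][m] ]
    wk = jvsCal.wettkampf.from_url("https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/")
    print(wk.to_json())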
wkOrg/src/wkScraper/wkScraper-JvsCal.py · 197 lines · Executable file
@@ -0,0 +1,197 @@
#! /usr/bin/env python3
# -*- coding: UTF-8 -*-

import re
import requests
from bs4 import BeautifulSoup
import datetime
import json

import jvsCal


class shiai:
    def __init__(self, title, date, ageGroups):
        self.title = title
        self.date = date
        self.ageGroups = ageGroups

        return

    def toDict(self):
        return {
            'title': self.title,
            'date': self.date,
            'ageGroups': self.ageGroups,
        }

    def toJson(self):
        return json.dumps(self.toDict())


class event:
    '''
    An event consists of:
    - a time span
    - a start date (+ end date)
    - a title (name)
    - a place
    - a URL
    '''
    def __init__(self, timespan, title, place=None, url=None):
        self.date = datetime.date.fromisoformat( timespan[0] )
        self.endDate = None
        if len(timespan) >= 2:
            self.endDate = datetime.date.fromisoformat( timespan[1] )
        self.title = title
        self.place = place
        self.url = url
        return

    def toDict(self):
        wkDict = {}
        wkDict["date"] = str(self.date)
        if self.endDate is not None:
            wkDict["endDate"] = str(self.endDate)
        wkDict["title"] = self.title
        if self.place is not None:
            wkDict["place"] = self.place
        if self.url is not None:
            wkDict["url"] = self.url
        return wkDict

    def toJson(self):
        return json.dumps(self.toDict())


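# Construction sketch for the class above (title, place and URL are made-up
# example values; the ISO date strings match what parseJvsEvent builds below):
#
#   e = event(["2020-12-05", "2020-12-06"], "Beispielturnier", place="Dresden",
#             url="https://judoverbandsachsen.de/events/...")
#   print(e.toJson())
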
class wk(event):
    def __init__(self, timespan, title, akList, place=None, url=None):
        super().__init__(timespan, title, place, url)
        self.akList = akList

    def toDict(self):
        wkDict = super().toDict()
        wkDict["akList"] = self.akList
        return wkDict


def parseJvsEvent(jvsEvent, onlyIfWithAk=False):
    # 'year' is expected to be a module-level string, set further below before
    # this function is called.
    time = title = aks = place = url = None
    try:
        for e in jvsEvent.find_all(class_="col-2"):
            if e.find("time"):
                time = [ t.strip() for t in re.sub( r"\s+", " ", e.find("time").text.strip() ).split("-") ]
                time = [ t.split(".") for t in time ]
                time = [ "-".join( [year, t[1], t[0]] ) for t in time ]
            if e.find("span"):
                aks = [ ak.text.strip() for ak in e.find_all("span") ]
                aks = [ ak for ak in aks if ak != "" ]
                if len(aks) == 0:
                    aks = None
        place = jvsEvent.find(class_="col-3").text.strip()
        if place == "":
            place = None
        title = jvsEvent.find(class_="col-4").find("a").text.strip()
        url = jvsEvent.find(class_="col-4").find("a")['href']
        titleFull = jvsEvent.find(class_="col-4").text.strip()
        assert title == titleFull
    except Exception as err:
        print(f"Error '{err}' parsing:")
        print(jvsEvent.prettify())

    if aks is not None:
        return wk(time, title, aks, place, url)
    else:
        if onlyIfWithAk:
            return None
        else:
            return event(time, title, place, url)


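# Return-value sketch: parseJvsEvent() yields a wk when age groups were found,
# an event when none were found (and onlyIfWithAk is False), or None otherwise.
# "eventTag" below is a placeholder for a BeautifulSoup node of one calendar row.
#
#   parsed = parseJvsEvent(eventTag)
#   if parsed is not None:
#       print(parsed.toJson())
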
events = jvsCal.parseJvsCal(minYear=datetime.date.today().year, minMonth=datetime.date.today().month, onlyWithAks=True)
#events = jvsCal.parseJvsCal()

#print( jvsCal.parseJvsCal(minYear=datetime.date.today().year, minMonth=datetime.date.today().month, onlyWithAks=True) )

print( [ e['url'] for y in events for m in events[y] for e in events[y][m]] )

print( jvsCal.getWk([ e['url'] for y in events for m in events[y] for e in events[y][m]]) )

exit()

jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"

testWk = jvsCal.wettkampf.from_url( jvsCalShiaiUrl )
print(testWk.to_json())
exit()


#with open("rkp.html", "w") as f:
#    f.write(BeautifulSoup(requests.get("https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/").content, "html.parser").prettify())
#exit()


url = "https://judoverbandsachsen.de/kalender/?show=all"

jvsCalShiaiUrl = "https://judoverbandsachsen.de/events/23-raeucherkerzenpokal/"
#jvsCalShiaiUrl = "file://rkp.html"
#jvsCalShiaiPage = requests.get(jvsCalShiaiUrl)
#jvsCalShiaiSoup = BeautifulSoup(jvsCalShiaiPage.content, "html.parser")
jvsCalShiaiSoup = BeautifulSoup(open("rkp.html"), "html.parser")

year = "2020"
for e in jvsCalShiaiSoup.find_all(class_="event-single"):
    print(e.prettify())
    title = e.find("header").text.strip()
    articleTag = e.find("article")
    date = [ t.strip() for t in re.sub( r"\s+", " ", articleTag.find("time").text.strip() ).split("-") ]
    date = [ t.split(".") for t in date ]
    date = [ "-".join( [year, t[1], t[0]] ) for t in date ]

    [announcementDiv, placeDiv, ageGroupsDiv] = articleTag.find_all("div")

    announcement = {}
    place = {}
    for dt, dd in zip(articleTag.find_all("dt"), articleTag.find_all("dd")):
        if dt.text.strip() == "Ausschreibung:":
            announcement['url'] = dd.find("a")['href']
        if dt.text.strip() == "Veranstalter:":
            announcement['organizer'] = dd.text.strip()
        if dt.text.strip() == "Veranstaltungsort:":
            place['name'] = dd.text.strip()
        if dt.text.strip() == "Veranstaltungsadresse:":
            place['address'] = re.sub(r"\s+", " ", dd.text.strip())
        # print(dt.text, dd.text)

    ageGroups = [ ak.text.strip() for ak in ageGroupsDiv.find_all("span") ]

    print(f"title: {title}")
    print(f"date: {date}")
    print(f"announcement: {announcement}")
    print(f"place: {place}")
    print(f"ageGroups: {ageGroups}")
exit()


jvsCalPage = requests.get(url)

jvsCalSoup = BeautifulSoup(jvsCalPage.content, "html.parser")

#jvsCalEventList = jvsCalSoup.find(id="eventList")
#print(jvsCalEventList.prettify())

jvsCalEventListItems = jvsCalSoup.find(id="eventListItems")
#print(jvsCalEventListItems.prettify())

jvsCalEventMonths = jvsCalEventListItems.find_all("div", class_="month")

jvsWkList = []

for m in jvsCalEventMonths:
    if m.has_attr("data-month"):
        yearMonth = m.get("data-month")
        year = yearMonth[0:4]
        month = yearMonth[4:6]
        print(f"Jahr: {year}, Monat: {month}")
        events = m.find_all(class_="posts")
        for eventTag in events:  # named eventTag to avoid shadowing the event class above
            parsedEvent = parseJvsEvent(eventTag, onlyIfWithAk=True)
            if parsedEvent is not None:
                jvsWkList.append(parsedEvent)
    else:
        print("no data-month")

for w in jvsWkList:
    print(w.toJson())