From 239eaf13018ede27adba2212c2c783857176e488 Mon Sep 17 00:00:00 2001 From: Fulgen301 Date: Sun, 8 Sep 2019 21:54:39 +0200 Subject: Fix description parsing and convert HTML to markdown --- handlers.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'handlers.py') diff --git a/handlers.py b/handlers.py index 9f48349..6a24a65 100644 --- a/handlers.py +++ b/handlers.py @@ -16,6 +16,7 @@ from .helpers import * import locale, html, string from datetime import datetime from abc import * +from html2text import html2text class Site(ABC): regex = None @@ -52,7 +53,7 @@ class Website(Site): entry.tags = [self.tag] entry.slug = "".join(i for i in html.unescape(m["title"]).lower() if i in string.ascii_letters) entry.updated_at = datetime.strptime(m["updatedAt"], self.date_format) - entry.version = m["version"] + entry.version = m["version"] if "version" in m.groupdict() else "" entry.readonly = True try: @@ -85,20 +86,18 @@ class Website(Site): locale.resetlocale(locale.LC_ALL) - r = requests.get(self.prefix + m["entryURL"]) - d = self.regexes["desc"].match(r.text) - if d and "description" in d.groups(): - entry.description = html.unescape(d["description"]) + r = requests.get(self.prefix + html.unescape(m["entryURL"])) + d = self.regexes["desc"].search(r.text) + if d and "description" in d.groupdict(): + entry.description = html2text(html.unescape(d["description"])) - - if not exists: - session.add(entry) + session.add(entry) session.commit() class CCAN(Website): regexes = { "list" : re.compile(r"<.*?>.*?)\.gif\".*?ccan-view\.pl\?a=view\&i=\d*?)\">(?P.*?)(<I>v</I>(?P<version>.*?))?</A><.*?><A HREF=\"(?P<downloadURL>ccan-dl-auth\.pl/(?P<id>\d*)/.*?)\"><.*?><A HREF=\"ccan-user.pl.*?\">(?P<author>.*?)</A><.*?>\((?P<niveau>\d\.\d)\).*?>(?P<updatedAt>\d\d\.\d\d\.\d\d\ \d\d\:\d\d).*?</TR>"), - "desc" : re.compile(r"<TD>Beschreibung:</TD><TD>(?P<description>.*?)</TD></TR>") + "desc" : re.compile(r"<TD>(?:Beschreibung|Description):</TD><TD>(?P<description>.*?)</TD></TR>") } address = "https://ccan.de/cgi-bin/ccan/ccan-view.pl?a=&sc=tm&so=d&nr=100000&pg=0&ac=ty-ti-ni-tm-rp-ev&reveal=1" @@ -108,7 +107,7 @@ class CCAN(Website): class CC(Website): regexes = { - "list" : re.compile(r"<tr><td align=\"right\">.*?<a href=\"/(?P<entryURL>download\.php\?act=getinfo&dl=\d*?)\">(?P<title>.*?)</a></td><td align=\"right\"><a href=\"(?P<downloadURL>downloads/dl\d*?/.*?)\"><img src=\"picz/dl\.gif\" alt=\"Runterladen\" title=\"Runterladen\" border=\"0\"></a></td><td>(<a href=\"user\.php.*?\">|)(?P<author>.*?)(</a>|)</td>.*?<td style=\"border-right:0px;\">(?P<updatedAt>\d\d\.\d\d\.\d\d\d\d \d{1,2}:\d\d)</td></tr>"), + "list" : re.compile(r"<tr><td align=\"right\">.*?<a href=\"/(?P<entryURL>download\.php\?act=getinfo&dl=\d*?)\">(?P<title>.*?)(?: v(?P<version>.*?)|)</a></td><td align=\"right\"><a href=\"(?P<downloadURL>downloads/dl\d*?/.*?)\"><img src=\"picz/dl\.gif\" alt=\"Runterladen\" title=\"Runterladen\" border=\"0\"></a></td><td>(<a href=\"user\.php.*?\">|)(?P<author>.*?)(</a>|)</td>.*?<td style=\"border-right:0px;\">(?P<updatedAt>\d\d\.\d\d\.\d\d\d\d \d{1,2}:\d\d)</td></tr>"), "desc" : re.compile("") } address = "https://cc-archive.lwrl.de/download.php" -- cgit v1.2.3-54-g00ecf