# Copyright (c) 2018, George Tokmaji
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# NOTE(review): re, requests, db, DBSession, ObjectId, Upload, User, File and
# calculateHashForResource are not imported explicitly in this file; they are
# presumably re-exported by .helpers -- confirm.
from .helpers import *
import locale, html, string
from datetime import datetime
from abc import *
from html2text import html2text


class Site(ABC):
    """Abstract base for an external source that can be imported."""

    regex = None

    @abstractmethod
    def process(self):
        """Import the site's entries; subclasses must implement this."""
        return


class Website(Site):
    """Importer that scrapes an HTML listing page with regular expressions.

    Subclasses supply the class attributes below:

    * ``tag``         -- single tag stored on every imported upload
    * ``prefix``      -- URL prefix prepended to the scraped relative URLs
    * ``address``     -- URL of the listing page
    * ``date_format`` -- ``strptime`` format of the ``updatedAt`` group
    * ``regexes``     -- compiled patterns: ``"list"`` is applied to the
      listing page with ``finditer`` and must provide the groups ``title``,
      ``updatedAt``, ``author``, ``downloadURL`` and ``entryURL``
      (``version`` is optional); ``"desc"`` is searched in the entry page
      and may provide a ``description`` group.
    """

    tag = ""
    prefix = ""
    address = ""
    date_format = ""
    regexes = {
        "list" : "",
        "desc" : ""
    }

    def process(self):
        """Scrape the listing page and create or update one Upload per match.

        Does nothing when the listing request returns an unsuccessful
        response. For uploads not yet in the database the download URL is
        requested and a File row is added; if that request fails the entry
        is skipped. All changes are committed once at the end.
        """
        listing = requests.get(self.address)
        if not listing:
            return

        session = DBSession()
        for m in self.regexes["list"].finditer(listing.text):
            updated_at = datetime.strptime(m["updatedAt"], self.date_format)
            title = html.unescape(m["title"])

            # The primary key is derived from the update timestamp alone.
            # NOTE(review): entries sharing a timestamp map to the same id --
            # presumably acceptable; confirm.
            upload_id = ObjectId.from_datetime(updated_at)
            try:
                entry = session.query(Upload).filter_by(id=upload_id).one()
                exists = True
            except db.orm.exc.NoResultFound:
                entry = Upload(id=upload_id)
                exists = False

            entry.title = title
            entry.tags = [self.tag]
            # Slug: lowercased title reduced to ASCII letters only.
            entry.slug = "".join(c for c in title.lower() if c in string.ascii_letters)
            entry.updated_at = updated_at
            entry.version = m["version"] if "version" in m.groupdict() else None
            entry.readonly = True

            try:
                entry.author = session.query(User).filter_by(name=m["author"]).one()
            except db.orm.exc.NoResultFound:
                # Unknown author: leave entry.author as it is.
                pass

            if not exists:
                downloadURL = self.prefix + m["downloadURL"]
                try:
                    download = requests.get(downloadURL, stream=True, allow_redirects=True)
                except requests.exceptions.ConnectionError:
                    continue

                if not download:
                    continue

                # The HTTP Date header uses English day/month names, so it is
                # parsed under the "C" locale. The previous locale is saved
                # and restored in ``finally`` so that a failure (for example
                # a missing header) cannot leave the process-wide locale
                # changed. locale.resetlocale() is not used: it was
                # deprecated in Python 3.11 and removed in 3.13.
                previous_locale = locale.setlocale(locale.LC_ALL)
                locale.setlocale(locale.LC_ALL, "C")
                try:
                    session.add(File(
                        hash=calculateHashForResource(download).hexdigest(),
                        id=entry.id,
                        name=downloadURL.split("/")[-1],
                        content_type=download.headers.get("Content-Type", "application/octet-stream"),
                        length=int(download.headers["Content-Length"]),
                        date=datetime.strptime(download.headers["Date"], "%a, %d %b %Y %H:%M:%S GMT"),
                        download_url=downloadURL,
                        upload=entry
                    ))
                finally:
                    locale.setlocale(locale.LC_ALL, previous_locale)

            # The description is optional: a connection failure here must not
            # abort the whole run and discard every uncommitted entry, so it
            # is handled like the download request above.
            try:
                detail = requests.get(self.prefix + html.unescape(m["entryURL"]))
            except requests.exceptions.ConnectionError:
                detail = None

            if detail is not None:
                d = self.regexes["desc"].search(detail.text)
                if d and "description" in d.groupdict():
                    entry.description = html2text(html.unescape(d["description"]))

            session.add(entry)

        session.commit()


class CCAN(Website):
    """Website configuration for the listing at ccan.de."""

    # NOTE(review): the "list" pattern arrived garbled (it contained
    # "(?P.*?)" and an unbalanced ")"), which makes re.compile raise at
    # import time. The group names "title" and "entryURL" were restored
    # because Website.process reads them; the name of the group in front of
    # "\.gif" could not be recovered, so it is an unnamed group here, and any
    # literal text that originally preceded these groups is missing.
    # Verify against the upstream pattern.
    regexes = {
        "list" : re.compile(r"<.*?>(.*?)\.gif\".*?(?P<entryURL>ccan-view\.pl\?a=view\&i=\d*?)\">(?P<title>.*?)(<I>v</I>(?P<version>.*?))?</A><.*?><A HREF=\"(?P<downloadURL>ccan-dl-auth\.pl/(?P<id>\d*)/.*?)\"><.*?><A HREF=\"ccan-user.pl.*?\">(?P<author>.*?)</A><.*?>\((?P<niveau>\d\.\d)\).*?>(?P<updatedAt>\d\d\.\d\d\.\d\d\ \d\d\:\d\d).*?</TR>"),
        "desc" : re.compile(r"<TD>(?:Beschreibung|Description):</TD><TD>(?P<description>.*?)</TD></TR>")
    }
    address = "https://ccan.de/cgi-bin/ccan/ccan-view.pl?a=&sc=tm&so=d&nr=100000&pg=0&ac=ty-ti-ni-tm-rp-ev&reveal=1"
    tag = "ccan"
    prefix = "https://ccan.de/cgi-bin/ccan/"
    date_format = "%d.%m.%y %H:%M"


class CC(Website):
    """Website configuration for the listing at cc-archive.lwrl.de."""

    regexes = {
        "list" : re.compile(r"<tr><td align=\"right\">.*?<a href=\"/(?P<entryURL>download\.php\?act=getinfo&dl=\d*?)\">(?P<title>.*?)(?: v(?P<version>.*?)|)</a></td><td align=\"right\"><a href=\"(?P<downloadURL>downloads/dl\d*?/.*?)\"><img src=\"picz/dl\.gif\" alt=\"Runterladen\" title=\"Runterladen\" border=\"0\"></a></td><td>(<a href=\"user\.php.*?\">|)(?P<author>.*?)(</a>|)</td>.*?<td style=\"border-right:0px;\">(?P<updatedAt>\d\d\.\d\d\.\d\d\d\d \d{1,2}:\d\d)</td></tr>"),
        # The empty pattern has no "description" group, so Website.process
        # never sets a description for this site.
        "desc" : re.compile("")
    }
    address = "https://cc-archive.lwrl.de/download.php"
    tag = "clonk-center"
    prefix = "https://cc-archive.lwrl.de/"
    date_format = "%d.%m.%Y %H:%M"