# Copyright (c) 2018, George Tokmaji

# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.

# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

from .helpers import *
import locale, html, string
from datetime import datetime
from abc import *


class Site(ABC):
    """Abstract base for a source of entries.

    Subclasses implement :meth:`process`; the implementations in this file
    are generators that yield entry dictionaries.
    """

    regex = None

    @abstractmethod
    def process(self):
        """Produce the entries of this site (implemented by subclasses)."""
        return


class Parry(Site):
    """Site whose ``process`` yields a single empty dictionary."""

    def process(self):
        """Yield one empty entry dictionary."""
        yield {}


class Website(Site):
    """Scraper template for an HTML listing page.

    Subclasses override the class attributes below; ``regexes`` must then
    hold *compiled* patterns, because :meth:`process` calls ``finditer`` and
    ``search`` on them.
    """

    # Tag stored in every produced entry's "tags" list.
    tag = ""
    # String prepended to the entry and download URLs captured by the regex.
    prefix = ""
    # URL of the listing page that is fetched and scanned.
    address = ""
    # strptime format of the "updatedAt" group captured by the list regex.
    date_format = ""
    # "list": matched repeatedly against the listing page; process() reads
    #         the groups title, author, updatedAt, entryURL, downloadURL and,
    #         if the pattern defines it, niveau.
    # "desc": searched in the entry page; an optional "description" group
    #         supplies the entry description.
    regexes = {
        "list" : "",
        "desc" : ""
    }

    def process(self):
        """Scan the listing page and yield a dict for every unknown entry.

        For each match of ``regexes["list"]`` an id is derived from the
        entry's timestamp; matches whose id is already in
        ``database["entries"]`` are skipped, as are entries whose download
        request yields a falsy response. Nothing is yielded when the
        listing request itself yields a falsy response.

        Yields:
            dict: the entry record, including one file record with the
            SHA-1 of the download and the description scraped from the
            entry page (empty string if none was found).
        """
        listing = requests.get(self.address)
        if not listing:
            return

        for m in self.regexes["list"].finditer(listing.text):
            print(m["title"])
            updated_at = datetime.strptime(m["updatedAt"], self.date_format)
            entry_id = str(ObjectId.from_datetime(updated_at))
            if entry_id in database["entries"]:
                continue

            # groupdict() is keyed by group *name*; Match.groups() only holds
            # the captured values, so testing a name against it never
            # succeeds. The value is None when the pattern has no such group
            # or the group did not take part in the match.
            niveau = m.groupdict().get("niveau")

            entry = {
                "title" : html.unescape(m["title"]),
                "voting" : {
                    "sum" : round(float(niveau), 0) if niveau is not None else 0,
                    "count" : 0,
                    "votes" : None
                },
                "tags" : [self.tag],
                "files" : [],
                "dependencies" : [],
                "deleted" : False,
                "description" : "",
                "pic" : None,
                "author" : {
                    "username" : html.unescape(m["author"])
                },
                "slug" : "",
                "updatedAt" : updated_at.isoformat(),
                "__v" : 1,
                "comments" : None,
                "id" : entry_id,
                "__intern" : {
                    "entryURL" : self.prefix + m["entryURL"]
                }
            }

            downloadURL = self.prefix + m["downloadURL"]
            download = requests.get(downloadURL, stream=True, allow_redirects=True)
            if not download:
                continue

            # The Date header uses English day/month names, so %a and %b
            # must be parsed under the "C" locale. The finally clause makes
            # sure the locale is switched back even if a header is missing
            # or malformed.
            locale.setlocale(locale.LC_ALL, "C")
            try:
                entry["files"] = [{
                    "metadata" : {
                        "hashes" : {
                            "sha1" : calculateHashForResource(download).hexdigest()
                        },
                        "downloadURL" : downloadURL
                    },
                    "aliases" : None,
                    "deleted" : False,
                    "_id" : entry["id"],
                    "filename" : m["downloadURL"].split("/")[-1],
                    "content-type" : download.headers.get("Content-Type", "application/octet-stream"),
                    "length" : int(download.headers["Content-Length"]),
                    "chunkSize" : 4096, # what is this for
                    "uploadDate" : datetime.strptime(download.headers["Date"], "%a, %d %b %Y %H:%M:%S GMT").isoformat(),
                }]
            finally:
                locale.setlocale(locale.LC_ALL, "")

            entry["createdAt"] = entry["updatedAt"]
            # Slug: the lower-cased title reduced to its ASCII letters.
            entry["slug"] = "".join(c for c in entry["title"].lower() if c in string.ascii_letters)

            entry_page = requests.get(entry["__intern"]["entryURL"])
            # search(), not match(): the description cell sits somewhere
            # inside the page, while match() only tries the very beginning.
            d = self.regexes["desc"].search(entry_page.text)
            description = d.groupdict().get("description") if d else None
            if description is not None:
                entry["description"] = html.unescape(description)

            yield entry


class CCAN(Website):
    """Scraper configuration for ccan.de."""

    # NOTE(review): the "list" pattern below looks damaged -- it contains an
    # unbalanced ")" and a "(?P" without a group name, and the "title" and
    # "entryURL" groups that Website.process reads are absent. As written,
    # re.compile raises re.error when this class body is executed. It is
    # reproduced unchanged here; restore the pattern from a known-good copy.
    regexes = {
        "list" : re.compile(r"<.*?>.*?)\.gif\".*?ccan-view\.pl\?a=view\&i=\d*?)\">(?P.*?)(<I>v</I>(?P<version>.*?))?</A><.*?><A HREF=\"(?P<downloadURL>ccan-dl-auth\.pl/(?P<id>\d*)/.*?)\"><.*?><A HREF=\"ccan-user.pl.*?\">(?P<author>.*?)</A><.*?>\((?P<niveau>\d\.\d)\).*?>(?P<updatedAt>\d\d\.\d\d\.\d\d\ \d\d\:\d\d).*?</TR>"),
        "desc" : re.compile(r"<TD>Beschreibung:</TD><TD>(?P<description>.*?)</TD></TR>")
    }
    address = "https://ccan.de/cgi-bin/ccan/ccan-view.pl?a=&sc=tm&so=d&nr=100000&pg=0&ac=ty-ti-ni-tm-rp-ev&reveal=1"
    tag = "ccan"
    prefix = "https://ccan.de/cgi-bin/ccan/"
    date_format = "%d.%m.%y %H:%M"


class CC(Website):
    """Scraper configuration for cc-archive.lwrl.de."""

    regexes = {
        "list" : re.compile(r"<tr><td align=\"right\">.*?<a href=\"/(?P<entryURL>download\.php\?act=getinfo&dl=\d*?)\">(?P<title>.*?)</a></td><td align=\"right\"><a href=\"(?P<downloadURL>downloads/dl\d*?/.*?)\"><img src=\"picz/dl\.gif\" alt=\"Runterladen\" title=\"Runterladen\" border=\"0\"></a></td><td>(<a href=\"user\.php.*?\">|)(?P<author>.*?)(</a>|)</td>.*?<td style=\"border-right:0px;\">(?P<updatedAt>\d\d\.\d\d\.\d\d\d\d \d{1,2}:\d\d)</td></tr>"),
        # Empty pattern without a "description" group: entries from this
        # site keep an empty description.
        "desc" : re.compile("")
    }
    address = "https://cc-archive.lwrl.de/download.php"
    tag = "clonk-center"
    prefix = "https://cc-archive.lwrl.de/"
    date_format = "%d.%m.%Y %H:%M"