1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
|
# Copyright (c) 2018, George Tokmaji
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
from .helpers import *
import locale, html, string
from datetime import datetime
from abc import *
class Site(ABC):
    """Abstract base class for a site that can be processed.

    Concrete subclasses must override :meth:`process`; the class cannot be
    instantiated otherwise.
    """

    # Pattern slot; the base class itself defines no pattern.
    regex = None

    @abstractmethod
    def process(self):
        """Process the site. The base implementation does nothing."""
        return None
class Website(Site):
    """Site whose uploads are scraped from an HTML listing page.

    Subclasses provide the URLs, the tag and two compiled patterns:

    * ``regexes["list"]`` is run with ``finditer`` over the listing page and
      must define the named groups ``title``, ``author``, ``updatedAt``,
      ``downloadURL`` and ``entryURL``.
    * ``regexes["desc"]`` is searched in the page behind ``entryURL``; if it
      defines a named group ``description`` that took part in the match, its
      text becomes the description of the upload.
    """

    # Single tag stored on every upload created by this site.
    tag = ""
    # Prepended to the relative ``downloadURL`` / ``entryURL`` groups.
    prefix = ""
    # URL of the listing page.
    address = ""
    # ``strptime`` format of the ``updatedAt`` group.
    date_format = ""
    # Placeholders; concrete subclasses replace them with compiled patterns.
    regexes = {
        "list": "",
        "desc": "",
    }

    def process(self):
        """Import every listed entry that is not stored yet.

        Fetches ``address`` and returns without doing anything when the
        response is falsy. For each match of ``regexes["list"]`` an id is
        derived from the ``updatedAt`` timestamp; matches whose id already has
        an ``Upload`` row are skipped. For the remaining ones an ``Upload`` and
        a ``File`` describing the download are created and committed.

        Entries whose download raises ``requests.exceptions.ConnectionError``
        or yields a falsy response are skipped. Other exceptions (for example
        a missing ``Content-Length`` or ``Date`` header) propagate.
        """
        listing = requests.get(self.address)
        if not listing:
            return

        session = DBSession()
        for m in self.regexes["list"].finditer(listing.text):
            updated_at = datetime.strptime(m["updatedAt"], self.date_format)
            # NOTE(review): the id depends on the timestamp alone, so entries
            # sharing an updatedAt value presumably map to the same id and
            # only the first of them gets imported -- confirm.
            upload_id = ObjectId.from_datetime(updated_at)
            try:
                session.query(Upload).filter_by(id=upload_id).one()
            except db.orm.exc.NoResultFound:
                pass
            else:
                # Already imported; existing uploads are never refreshed.
                continue

            title = html.unescape(m["title"])
            entry = Upload(id=upload_id)
            entry.title = title
            entry.tags = [self.tag]
            # Slug: the lower-cased title reduced to its ASCII letters.
            entry.slug = "".join(c for c in title.lower() if c in string.ascii_letters)
            entry.updated_at = updated_at
            entry._v = 1
            entry.readonly = True
            try:
                entry.author = session.query(User).filter_by(name=m["author"]).one()
            except db.orm.exc.NoResultFound:
                # No user with that name: the upload keeps no author.
                pass

            download_url = self.prefix + m["downloadURL"]
            try:
                download = requests.get(download_url, stream=True, allow_redirects=True)
            except requests.exceptions.ConnectionError:
                continue
            if not download:
                download.close()
                continue

            # %a and %b in strptime are locale dependent while the HTTP Date
            # header uses English names, hence the temporary "C" locale. The
            # finally clause makes sure the locale is switched back and the
            # streamed connection is released even if a header is missing.
            locale.setlocale(locale.LC_ALL, "C")
            try:
                session.add(File(
                    hash=calculateHashForResource(download).hexdigest(),
                    id=entry.id,
                    name=download_url.split("/")[-1],
                    content_type=download.headers.get("Content-Type", "application/octet-stream"),
                    length=int(download.headers["Content-Length"]),
                    date=datetime.strptime(download.headers["Date"], "%a, %d %b %Y %H:%M:%S GMT"),
                    download_url=download_url,
                    upload=entry
                ))
            finally:
                locale.setlocale(locale.LC_ALL, "")
                download.close()

            entry_page = requests.get(self.prefix + m["entryURL"])
            # search(), not match(): the description sits somewhere inside the
            # page. groupdict() maps group *names* to their text, so a pattern
            # without a "description" group simply yields no description.
            found = self.regexes["desc"].search(entry_page.text)
            if found:
                description = found.groupdict().get("description")
                if description is not None:
                    entry.description = html.unescape(description)

            session.add(entry)
            # NOTE(review): one commit per imported entry is assumed here;
            # confirm this is the intended transaction granularity.
            session.commit()
class CCAN(Website):
    """Scraper configuration for the listing at ccan.de."""

    address = "https://ccan.de/cgi-bin/ccan/ccan-view.pl?a=&sc=tm&so=d&nr=100000&pg=0&ac=ty-ti-ni-tm-rp-ev&reveal=1"
    prefix = "https://ccan.de/cgi-bin/ccan/"
    tag = "ccan"
    # Day.month.two-digit-year, as captured by the "updatedAt" group below.
    date_format = "%d.%m.%y %H:%M"

    regexes = {
        # One <TR> of the listing. Named groups: type, entryURL, title,
        # version (optional), downloadURL, id, author, niveau, updatedAt.
        # The adjacent raw strings are joined into a single pattern.
        "list": re.compile(
            r"<TR.*?><.*?><IMG SRC=\"/img/type-(?P<type>.*?)\.gif\".*?"
            r"<A HREF=\"(?P<entryURL>ccan-view\.pl\?a=view\&i=\d*?)\">"
            r"(?P<title>.*?)(<I>v</I>(?P<version>.*?))?</A><.*?>"
            r"<A HREF=\"(?P<downloadURL>ccan-dl-auth\.pl/(?P<id>\d*)/.*?)\"><.*?>"
            r"<A HREF=\"ccan-user.pl.*?\">(?P<author>.*?)</A><.*?>"
            r"\((?P<niveau>\d\.\d)\).*?>"
            r"(?P<updatedAt>\d\d\.\d\d\.\d\d\ \d\d\:\d\d).*?</TR>"
        ),
        # Table cell that follows the "Beschreibung:" label.
        "desc": re.compile(r"<TD>Beschreibung:</TD><TD>(?P<description>.*?)</TD></TR>"),
    }
class CC(Website):
    """Scraper configuration for the listing at cc-archive.lwrl.de."""

    address = "https://cc-archive.lwrl.de/download.php"
    prefix = "https://cc-archive.lwrl.de/"
    tag = "clonk-center"
    # Day.month.four-digit-year, as captured by the "updatedAt" group below.
    date_format = "%d.%m.%Y %H:%M"

    regexes = {
        # One <tr> of the listing. Named groups: entryURL, title,
        # downloadURL, author, updatedAt. The author may or may not be
        # wrapped in a link to user.php. The adjacent raw strings are joined
        # into a single pattern.
        "list": re.compile(
            r"<tr><td align=\"right\">.*?"
            r"<a href=\"/(?P<entryURL>download\.php\?act=getinfo&dl=\d*?)\">(?P<title>.*?)</a></td>"
            r"<td align=\"right\"><a href=\"(?P<downloadURL>downloads/dl\d*?/.*?)\">"
            r"<img src=\"picz/dl\.gif\" alt=\"Runterladen\" title=\"Runterladen\" border=\"0\"></a></td>"
            r"<td>(<a href=\"user\.php.*?\">|)(?P<author>.*?)(</a>|)</td>.*?"
            r"<td style=\"border-right:0px;\">(?P<updatedAt>\d\d\.\d\d\.\d\d\d\d \d{1,2}:\d\d)</td></tr>"
        ),
        # Empty pattern: it defines no "description" group.
        "desc": re.compile(""),
    }
|