14 # GNU General Public License for more details. |
14 # GNU General Public License for more details. |
15 # |
15 # |
16 # You should have received a copy of the GNU General Public License |
16 # You should have received a copy of the GNU General Public License |
17 # along with this program. If not, see <http://www.gnu.org/licenses/>. |
17 # along with this program. If not, see <http://www.gnu.org/licenses/>. |
18 |
18 |
19 |
19 import bs4 |
20 from html.parser import HTMLParser |
|
21 from pprint import pprint |
20 from pprint import pprint |
22 import xml.dom |
|
23 from itertools import chain, groupby |
21 from itertools import chain, groupby |
24 import logging |
22 import logging |
25 import operator |
23 import operator |
26 |
|
class Node:
    """Minimal DOM node built by the bundle page parser.

    Every keyword argument passed to the constructor becomes an instance
    attribute.  ``name`` is required; ``attrs`` (dict of HTML attributes)
    and ``childs`` (list of child nodes) default to empty containers.

    A ``div`` carrying a non-empty ``class`` attribute is renamed after
    that class string, and the attribute is removed from ``attrs``.  The
    original tag name is always kept in ``tag``.

    Children can be reached as attributes: ``node.span`` returns the
    first child whose ``name`` is ``"span"``.
    """

    def __init__(self, **args):
        """Store every keyword argument as an attribute and derive ``tag``.

        Raises:
            AttributeError: if no ``name`` keyword argument is given.
        """
        self.childs = []
        self.attrs = {}
        for arg in args:
            setattr(self, arg, args[arg])
        self.tag = self.name
        # A valueless ``class`` attribute arrives as None from HTMLParser;
        # using it as the node name would break find() and format(), so
        # only a real class string replaces the name.
        if self.name == "div" and self.attrs.get("class"):
            self.name = self.attrs["class"]
            del self.attrs["class"]

    def format(self, prefix = ""):
        """Return an XML-like dump of this node, indented by ``prefix``."""
        parts = [prefix, "<", self.name]
        for attr in self.attrs:
            value = self.attrs[attr]
            parts.append("\n" + prefix + " " + attr)
            if value:
                parts.append('="' + value + '"')
        if self.name == "data":
            # Text nodes carry their payload in the ``data`` attribute.
            parts.append(">" + self.data + "</" + self.name + ">")
        elif self.childs:
            parts.append(">")
            for child in self.childs:
                parts.append("\n" + child.format(prefix + " "))
            parts.append("\n" + prefix + "</" + self.name + ">")
        else:
            parts.append("/>")
        return "".join(parts)

    def find(self, prefix):
        """Yield the direct children whose name starts with ``prefix``."""
        for child in self.childs:
            if child.name.startswith(prefix):
                yield child

    def __getattr__(self, name):
        """Resolve an unknown attribute to the first child with that name.

        The match is cached as a real attribute, so later lookups no
        longer go through this method.

        Raises:
            AttributeError: if no direct child is called ``name``.
        """
        # Read ``childs`` straight from the instance dict: on an instance
        # that has not been initialised yet (copy.copy / pickle create such
        # objects) ``self.childs`` would re-enter __getattr__ forever.
        for child in self.__dict__.get("childs", ()):
            if child.name == name:
                setattr(self, name, child)
                return child
        raise AttributeError(name)

    def __repr__(self):
        """Return the same text as ``format()`` with no indentation."""
        return self.format()
|
68 |
|
class BundleParser(HTMLParser):
    """HTMLParser subclass that builds a tree of ``Node`` objects.

    ``dom`` is the synthetic root node, ``current`` is the element whose
    content is being parsed, and ``depth`` counts the open elements
    (starting at 1 for the root).
    """

    def __init__(self, **args):
        """Create the parser with an empty tree; ``args`` go to HTMLParser."""
        super(BundleParser, self).__init__(**args)
        self.dom = Node(name = "root",
                        childs = [],
                        parent = None)
        self.current = self.dom
        self.depth = 1

    def handle_starttag(self, tag, attrs):
        """Append a new element under ``current`` and descend into it."""
        new = Node(name = tag,
                   attrs = dict(attrs),
                   childs = [],
                   parent = self.current)
        self.current.childs.append(new)
        self.current = new
        self.depth += 1

    def handle_endtag(self, tag):
        """Close the nearest open element named ``tag``.

        Open elements nested inside it that were never closed explicitly
        are closed on the way up, each with a "Skipping" message.  An end
        tag that matches no open element is reported and ignored, leaving
        the tree position untouched.
        """
        # Look for the element to close before moving ``current``, so a
        # stray end tag cannot walk past the root and fail on None.
        target = self.current
        while target is not None and target is not self.dom and target.tag != tag:
            target = target.parent
        if target is None or target is self.dom:
            print("*** Ignoring unmatched end tag", tag)
            return
        while self.current is not target:
            print("*** Skipping", self.current.tag,"; looking for",tag)
            self.current = self.current.parent
            self.depth -= 1
        self.depth -= 1
        self.current = self.current.parent

    def handle_data(self, data):
        """Attach non-blank text to ``current`` as a ``data`` node."""
        if data.strip():
            self.current.childs.append(Node(name = "data", data = data, childs = []))
|
100 |
|
def findRows(dom):
    """Yield every node below ``dom`` whose name starts with ``"row "``.

    The tree is walked depth-first in document order.  A matching node is
    yielded once and is not searched further; any other node is searched
    recursively.

    The earlier version wrapped the test in ``except KeyError`` followed by
    a second traversal of the same child.  Nothing in the guarded code can
    raise ``KeyError``, so that handler and the repeated traversal were
    removed.
    """
    for child in dom.childs:
        if child.name[:4] == "row ":
            yield child
        else:
            for row in findRows(child):
                yield row
|
113 |
24 |
114 class Download: |
25 class Download: |
115 subst = { "arc32" : ("x86",), |
26 subst = { "arc32" : ("x86",), |
116 "arc64" : ("x64",), |
27 "arc64" : ("x64",), |
117 "i386.deb" : ("x86","deb"), |
28 "i386.deb" : ("x86","deb"), |
128 "32-bit" : ("x86",), |
39 "32-bit" : ("x86",), |
129 "64-bit" : ("x64",), |
40 "64-bit" : ("x64",), |
130 "(HD)" : ("HD",), |
41 "(HD)" : ("HD",), |
131 "(MP3)" : ("MP3",), |
42 "(MP3)" : ("MP3",), |
132 } |
43 } |
133 def __init__(self, dltype, dom): |
44 def __init__(self, dltype, soup): |
134 self.dltype = dltype |
45 self.dltype = dltype |
135 self.id = dom.name[len("download "):] |
46 ids = [attr for attr in soup["class"] if attr != "download"] |
136 button = list(dom.find("flexbtn "))[0] |
47 button = soup.find(class_="flexbtn") |
137 desc = button.span.data.data |
48 desc = button.span.string |
138 self.id += " " + desc |
49 ids.extend(desc.split(" ")) |
|
50 self.id = " ".join(ids) |
139 def cleanup(attr): |
51 def cleanup(attr): |
140 attr = attr.strip() |
52 attr = attr.strip() |
141 if attr not in ("Download","small",""): |
53 if attr not in ("Download","small",""): |
142 for s in self.subst.get(attr,(attr,)): |
54 for s in self.subst.get(attr,(attr,)): |
143 yield s |
55 yield s |
144 self.attrs = set(chain.from_iterable(cleanup(attr) for attr in self.id.split(" "))) |
56 self.attrs = set(chain.from_iterable(cleanup(attr) for attr in ids)) |
145 urls = button.a.attrs |
57 urls = button.a.attrs |
146 self.torrent = urls["data-bt"] |
58 self.torrent = urls["data-bt"] |
147 self.web = urls["data-web"] |
59 self.web = urls["data-web"] |
148 details = dom.dldetails.dlsize |
60 details = soup.find(class_="dldetails").find(class_="dlsize") |
149 if details.childs: |
61 size = details.find(class_="mbs") |
150 self.size = details.span.data.data |
62 md5 = details.find(class_="dlmd5") |
151 self.md5 = details.a.attrs["href"] |
63 date = details.find(class_="dldate") |
152 else: |
64 self.size = size.string if size else "Unknown" |
153 self.size = "Unknown" |
65 self.md5 = md5.string if md5 else "Unknown" |
154 self.md5 = "Unknown" |
66 self.date = date.string if date else "Unknown" |
155 def format(self, prefix=""): |
67 def format(self, prefix=""): |
156 res = prefix + '<download type="' + self.dltype + '" id="' + self.id + '">\n' |
68 res = prefix + '<download type="' + self.dltype + '" id="' + self.id + '">\n' |
157 res += prefix + " <web>" + self.web + "</web>\n" |
69 res += prefix + " <web>" + self.web + "</web>\n" |
158 res += prefix + " <torrent>" + self.torrent + "</torrent>\n" |
70 res += prefix + " <torrent>" + self.torrent + "</torrent>\n" |
159 res += prefix + " <size>" + self.size + "</size>\n" |
71 res += prefix + " <size>" + self.size + "</size>\n" |
160 res += prefix + " <md5>" + self.md5 + "</md5>\n" |
72 res += prefix + " <md5>" + self.md5 + "</md5>\n" |
|
73 res += prefix + " <date>" + self.date + "</date>\n" |
161 res += prefix + "</download>" |
74 res += prefix + "</download>" |
162 return res |
75 return res |
163 def __repr__(self): |
76 def __repr__(self): |
164 return self.format() |
77 return self.format() |
165 |
78 |
166 class Downloads: |
79 class Downloads: |
167 def __init__(self, dom): |
80 def __init__(self, soup): |
168 self.id = dom.name[len("downloads "):].split(" ")[0] |
81 self.id = [class_ for class_ in soup["class"] if class_ != "downloads"][0] |
169 self.elements = [] |
82 self.elements = [] |
170 self.others = [] |
83 self.others = [] |
171 self.addchilds(dom) |
84 self.addchilds(soup) |
172 def addchilds(self, dom): |
85 def addchilds(self, soup): |
173 for child in dom.childs: |
86 for child in soup.children: |
174 if child.name.startswith("downloads"): |
87 if type(child) is not bs4.element.Tag: |
|
88 continue |
|
89 classes = child["class"] if "class" in child.attrs else [] |
|
90 if [True for attr in classes if attr in ("arc-toggle", "downloads")]: |
175 self.addchilds(child) |
91 self.addchilds(child) |
176 elif child.name.startswith("download"): |
92 elif "download" in classes: |
177 self.elements.append(Download(self.id, child)) |
93 self.elements.append(Download(self.id, child)) |
178 elif child.name == "arc-toggle": |
94 elif [True for attr in classes if attr in ("clearfix","label")]: |
179 self.addchilds(child) |
|
180 elif child.name in ("clearfix","label"): |
|
181 pass |
95 pass |
182 else: |
96 else: |
183 self.others.append(child) |
97 self.others.append(child) |
184 def __iter__(self): |
98 def __iter__(self): |
185 return iter(self.elements) |
99 return iter(self.elements) |
197 return res |
111 return res |
198 def __repr__(self): |
112 def __repr__(self): |
199 return self.format() |
113 return self.format() |
200 |
114 |
201 class Game: |
115 class Game: |
202 def __init__(self, dom): |
116 def __init__(self, soup): |
203 self.title = "unknown" |
117 self.title = "unknown" |
204 self.downloads = [] |
118 self.downloads = [] |
205 self.others = [] |
119 self.others = [] |
206 for child in dom.childs: |
120 for child in soup.children: |
207 if child.name == "gameinfo": |
121 if type(child) is not bs4.element.Tag: |
208 self.title = dom.gameinfo.title.a.data.data.strip() |
122 continue |
209 elif child.name.startswith("downloads "): |
123 classes = child["class"] if "class" in child.attrs else [] |
|
124 if "gameinfo" in classes: |
|
125 self.title = child.find(class_="title").a.string.strip() |
|
126 elif "downloads" in classes: |
210 self.downloads.append(Downloads(child)) |
127 self.downloads.append(Downloads(child)) |
211 elif child.name in ["icn", "clearfix"]: |
128 elif [True for attr in classes if attr in ["icn", "clearfix"]]: |
212 pass |
129 pass |
213 else: |
130 else: |
214 self.others.append(child) |
131 self.others.append(child) |
215 def __repr__(self): |
132 def __repr__(self): |
216 res = "<game>\n" |
133 res = "<game>\n" |
226 res += o.format(" ") + "\n" |
143 res += o.format(" ") + "\n" |
227 res += " </others>\n" |
144 res += " </others>\n" |
228 res += "</game>" |
145 res += "</game>" |
229 return res |
146 return res |
230 |
147 |
231 def parseGamesFromDom(dom): |
148 def parseGamesFromSoup(soup): |
232 for row in findRows(dom): |
149 for row in soup.find_all(class_="row"): |
233 yield Game(row) |
150 yield Game(row) |
234 |
151 |
235 def parseGamesFromFile(filename): |
152 def parseGamesFromFile(filename): |
236 parser = BundleParser() |
153 for game in parseGamesFromSoup(bs4.BeautifulSoup(open(filename))): |
237 for l in open(filename): |
|
238 parser.feed(l) |
|
239 for game in parseGamesFromDom(parser.dom): |
|
240 yield game |
154 yield game |
241 |
155 |
242 class FileSelector: |
156 class FileSelector: |
243 def scoreDownload(self, dl): |
157 def scoreDownload(self, dl): |
244 if dl.dltype == "audio": |
158 if dl.dltype == "audio": |
297 for game in parseGamesFromFile(fn): |
211 for game in parseGamesFromFile(fn): |
298 for dls in game.downloads: |
212 for dls in game.downloads: |
299 scores = list(selector(dls)) |
213 scores = list(selector(dls)) |
300 choosen = selectHighestScore(scores) |
214 choosen = selectHighestScore(scores) |
301 for score, dl in scores: |
215 for score, dl in scores: |
302 print("[%s] %2d | %-20s | %-10s | %-25s | %s " % ( |
216 print("[%s] %2d | %-20s | %-15s | %-10s | %-25s | %s " % ( |
303 "*" if dl in choosen else " ", |
217 "*" if dl in choosen else " ", |
304 score, |
218 score, |
305 game.title, |
219 game.title, |
306 dls.id, |
220 dls.id, |
|
221 dls.date, |
307 ", ".join(sorted(dl.attrs)), |
222 ", ".join(sorted(dl.attrs)), |
308 dl.torrent)) |
223 dl.torrent)) |
309 if dl in choosen: |
224 if dl in choosen: |
310 downloads.append(dl) |
225 downloads.append(dl) |
311 if not scores: |
226 if not scores: |