|
1 #!/usr/bin/python3 |
|
2 |
|
3 from html.parser import HTMLParser |
|
4 from pprint import pprint |
|
5 import xml.dom |
|
6 from itertools import chain |
|
7 |
|
class Node:
    """One element or text run of the parsed document tree.

    Every keyword argument becomes an instance attribute; ``name`` is
    required.  ``tag`` always keeps the element name that was passed in.
    A ``div`` carrying a non-empty ``class`` attribute is renamed after that
    class (and the attribute is dropped from ``attrs``), which lets callers
    address children by class name, e.g. ``node.gameinfo``.

    Text runs are nodes named ``"data"`` whose ``data`` attribute holds the
    text.
    """

    def __init__(self, **args):
        self.childs = []
        self.attrs = {}
        for key, value in args.items():
            setattr(self, key, value)
        # Raises AttributeError("name") when no name was supplied.
        self.tag = self.name
        # A valueless or empty class (``<div class>``) would leave the node
        # with a None/empty name and break every later string test on it, so
        # such a div keeps its tag name and its class attribute.
        if self.name == "div" and self.attrs.get("class"):
            self.name = self.attrs.pop("class")

    def format(self, prefix=""):
        """Return an XML-like dump of this node, each line led by *prefix*.

        Attribute values and text are emitted verbatim (no escaping).
        """
        pieces = [prefix, "<", self.name]
        for attr, value in self.attrs.items():
            pieces.append("\n" + prefix + " " + attr)
            if value:
                pieces.append('="' + value + '"')
        # Look in the instance dict so that an *element* that merely happens
        # to be named "data" is not mistaken for a text node (going through
        # __getattr__ would hand back a child Node instead of a string).
        text = self.__dict__.get("data")
        if self.name == "data" and isinstance(text, str):
            pieces.append(">" + text + "</" + self.name + ">")
        elif self.childs:
            pieces.append(">")
            for child in self.childs:
                pieces.append("\n" + child.format(prefix + " "))
            pieces.append("\n" + prefix + "</" + self.name + ">")
        else:
            pieces.append("/>")
        return "".join(pieces)

    def find(self, prefix):
        """Yield the direct children whose name starts with *prefix*."""
        for child in self.childs:
            if child.name.startswith(prefix):
                yield child

    def __getattr__(self, name):
        """Resolve an unknown attribute to the first direct child of that name.

        The child found is stored on the instance, so later lookups of the
        same name bypass this method.  Raises AttributeError when no direct
        child matches.
        """
        # Read ``childs`` from the instance dict: going through ``self.childs``
        # would re-enter __getattr__ endlessly on an instance whose __init__
        # has not run yet.
        for child in self.__dict__.get("childs", ()):
            if child.name == name:
                setattr(self, name, child)
                return child
        raise AttributeError(name)

    def __repr__(self):
        return self.format()
|
49 |
|
class BundleParser(HTMLParser):
    """HTMLParser that builds a tree of Node objects under ``self.dom``.

    ``self.current`` is the node new children are attached to, and
    ``self.depth`` tracks how deep that node sits (the root is depth 1).
    """

    def __init__(self, **args):
        super().__init__(**args)
        self.dom = Node(name="root", childs=[], parent=None)
        self.current = self.dom
        self.depth = 1

    def handle_starttag(self, tag, attrs):
        """Append a new element under the current node and descend into it."""
        new = Node(name=tag, attrs=dict(attrs), childs=[], parent=self.current)
        self.current.childs.append(new)
        self.current = new
        self.depth += 1

    def handle_endtag(self, tag):
        """Close the nearest open element whose tag is *tag*.

        Elements left open below it are closed implicitly, with a diagnostic
        printed for each.  A closing tag that matches no open element is
        reported and ignored.
        """
        # Look for the element being closed before touching any state, so a
        # stray closing tag cannot unwind the whole tree and run off the root
        # (whose parent is None).
        target = self.current
        while target is not self.dom and target.tag != tag:
            target = target.parent
        if target is self.dom:
            print("*** Ignoring unmatched closing tag", tag)
            return
        while self.current is not target:
            print("*** Skipping", self.current.tag, "; looking for", tag)
            self.current = self.current.parent
            self.depth -= 1
        self.depth -= 1
        self.current = target.parent

    def handle_data(self, data):
        """Attach non-blank text to the current node as a ``data`` node."""
        if data.strip():
            self.current.childs.append(Node(name="data", data=data, childs=[]))
|
81 |
|
def findRows(dom):
    """Yield, in document order, the outermost nodes whose name starts with "row ".

    The tree below *dom* is searched depth-first.  A matching node is yielded
    as-is and not searched any further; every other node is descended into.
    """
    for child in dom.childs:
        if child.name.startswith("row "):
            yield child
        else:
            yield from findRows(child)
|
94 |
|
class Download:
    """One downloadable file, read from a ``download ...`` node.

    Attributes set by the constructor:
      dltype  -- the category passed in by the caller
      id      -- node name without its "download " prefix, followed by the
                 button caption
      attrs   -- set of normalised tokens taken from ``id``
      torrent -- value of the button link's ``data-bt`` attribute
      web     -- value of the button link's ``data-web`` attribute
      size    -- size text, or "Unknown" when the details node is empty
      md5     -- ``href`` of the details link, or "Unknown"
    """

    # Maps a raw token of the id to the normalised token(s) replacing it.
    subst = {
        "arc32": ("x86",),
        "arc64": ("x64",),
        "i386.deb": ("x86", "deb"),
        "x86_64.deb": ("x64", "deb"),
        "i686.rpm": ("x86", "rpm"),
        ".i386.rpm": ("x86", "rpm"),
        "x86_64.rpm": ("x64", "rpm"),
        ".x86_64.rpm": ("x64", "rpm"),
        "i386.tar.gz": ("x86", "tgz"),
        "x86_64.tar.gz": ("x64", "tgz"),
        ".tar.gz": ("tgz",),
        ".deb": ("deb",),
        ".rpm": ("rpm",),
        "32-bit": ("x86",),
        "64-bit": ("x64",),
    }

    def __init__(self, dltype, dom):
        """Extract the download's details from *dom*.

        Raises IndexError when *dom* has no child whose name starts with
        "flexbtn ", and KeyError when the button link lacks ``data-bt`` or
        ``data-web``.
        """
        self.dltype = dltype
        button = next(iter(dom.find("flexbtn ")), None)
        if button is None:
            raise IndexError("no 'flexbtn' child in %r" % (dom.name,))
        caption = button.span.data.data
        self.id = dom.name[len("download "):] + " " + caption

        tokens = set()
        for word in self.id.split(" "):
            word = word.strip()
            # Button boilerplate and empty fragments carry no information.
            if word in ("Download", "small", ""):
                continue
            tokens.update(self.subst.get(word, (word,)))
        self.attrs = tokens

        urls = button.a.attrs
        self.torrent = urls["data-bt"]
        self.web = urls["data-web"]

        details = dom.dldetails.dlsize
        if details.childs:
            self.size = details.span.data.data
            self.md5 = details.a.attrs["href"]
        else:
            self.size = "Unknown"
            self.md5 = "Unknown"

    def score(self):
        """Return this download's preference score; higher is better.

        A negative score marks a download that should never be picked.
        Raises ValueError for an audio download with no known format token
        and for an unknown ``dltype``.
        """
        if self.dltype == "audio":
            for token, value in (("FLAC", 2), ("MP3", 1), ("website", -1)):
                if token in self.attrs:
                    return value
            raise ValueError("Unknown audio type: %r" % (self.attrs,))
        if self.dltype in ("mac", "windows"):
            return -1
        if self.dltype == "linux":
            score = 1
            if "x64" in self.attrs:
                score += 1
            if "deb" in self.attrs:
                score += 1
            return score
        if self.dltype == "android":
            return 0
        raise ValueError("Unknown dls type: %r" % (self,))

    def format(self, prefix=""):
        """Return an XML-like dump of this download, each line led by *prefix*.

        Values are emitted verbatim (no escaping).
        """
        lines = [
            '<download id="' + self.id + '">',
            " <web>" + self.web + "</web>",
            " <torrent>" + self.torrent + "</torrent>",
            " <size>" + self.size + "</size>",
            " <md5>" + self.md5 + "</md5>",
            "</download>",
        ]
        return "\n".join(prefix + line for line in lines)

    def __repr__(self):
        return self.format()
|
165 |
|
class Downloads:
    """The downloads of one category, read from a ``downloads ...`` node.

    ``id`` is the first word after the "downloads " prefix of the node name,
    ``elements`` holds the Download objects found, and ``others`` collects
    child nodes that were not recognised.  Iterating yields ``elements``.
    """

    def __init__(self, dom):
        self.id = dom.name[len("downloads "):].split(" ")[0]
        self.elements = []
        self.others = []
        self.addchilds(dom)

    def addchilds(self, dom):
        """Sort the children of *dom* into ``elements`` and ``others``.

        Nested ``downloads...`` and ``arc-toggle`` nodes are searched
        recursively; ``clearfix`` and ``label`` nodes are dropped.
        """
        for child in dom.childs:
            name = child.name
            # "downloads" must be tested before "download", its own prefix.
            if name.startswith("downloads") or name == "arc-toggle":
                self.addchilds(child)
            elif name.startswith("download"):
                self.elements.append(Download(self.id, child))
            elif name not in ("clearfix", "label"):
                self.others.append(child)

    def __iter__(self):
        return iter(self.elements)

    def format(self, prefix=""):
        """Return an XML-like dump of this group, each line led by *prefix*."""
        lines = [prefix + '<downloads id="' + self.id + '">']
        lines.extend(el.format(prefix + " ") for el in self.elements)
        if self.others:
            lines.append(prefix + " <others>")
            lines.extend(o.format(prefix + " ") for o in self.others)
            lines.append(prefix + " </others>")
        lines.append(prefix + "</downloads>")
        return "\n".join(lines)

    def choose(self):
        """Return a one-element list holding the best-scoring download.

        Downloads with a negative score are never chosen; the result is an
        empty list when none is left.  Among equal scores the download that
        comes first in ``elements`` wins.
        """
        # Score each download exactly once.
        scored = [(dl.score(), dl) for dl in self]
        eligible = [pair for pair in scored if pair[0] >= 0]
        if not eligible:
            return []
        # max() returns the first maximal item, so ties resolve to the
        # earliest download.
        return [max(eligible, key=lambda pair: pair[0])[1]]

    def __repr__(self):
        return self.format()
|
206 |
|
class Game:
    """One game entry, read from a ``row ...`` node.

    ``title`` defaults to "unknown", ``downloads`` holds one Downloads object
    per ``downloads ...`` child, and ``others`` collects the children that
    were not recognised.
    """

    def __init__(self, dom):
        self.title = "unknown"
        self.downloads = []
        self.others = []
        ignored = ["icn", "clearfix"]
        for node in dom.childs:
            label = node.name
            if label == "gameinfo":
                # The title is the text of the link inside the title element.
                link_text = dom.gameinfo.title.a.data
                self.title = link_text.data.strip()
                continue
            if label.startswith("downloads "):
                self.downloads.append(Downloads(node))
                continue
            if label not in ignored:
                self.others.append(node)

    def __repr__(self):
        """Return an XML-like dump of the game."""
        chunks = ["<game>\n", " <title>", self.title, "</title>\n"]
        if self.downloads:
            chunks.append(" <downloads>\n")
            chunks.extend(group.format(" ") + "\n" for group in self.downloads)
            chunks.append(" </downloads>\n")
        if self.others:
            chunks.append(" <others>\n")
            chunks.extend(extra.format(" ") + "\n" for extra in self.others)
            chunks.append(" </others>\n")
        chunks.append("</game>")
        return "".join(chunks)
|
236 |
|
def parseGames(dom):
    """Yield a Game for every row node that findRows() locates below *dom*."""
    yield from map(Game, findRows(dom))
|
240 |
|
# Parse the saved bundle page line by line.
parser = BundleParser()
with open("tidy_bundle.html") as f:
    for l in f:
        parser.feed(l)
# feed() holds back incomplete data until more arrives; close() forces the
# parser to process whatever is still buffered at end of file.
parser.close()

# One output line per download; "*" marks the download picked for its group.
for game in parseGames(parser.dom):
    for dls in game.downloads:
        choosen = dls.choose()
        for dl in dls:
            print("%s | %-20s | %-10s | %-25s | %s " % (
                "*" if dl in choosen else " ",
                game.title,
                dls.id,
                ", ".join(sorted(dl.attrs)),
                dl.torrent))