--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/update-hib.py Sun Aug 26 12:23:05 2012 -0400
@@ -0,0 +1,255 @@
+#!/usr/bin/python3
+
+from html.parser import HTMLParser
+from pprint import pprint
+import xml.dom
+from itertools import chain
+
+class Node:
+    """Lightweight tree element built while parsing the bundle page.
+
+    Every keyword argument passed to the constructor becomes an instance
+    attribute; ``name`` is required.  A ``div`` that carries a ``class``
+    attribute is renamed after its class value (the real tag is kept in
+    ``tag``), which lets the rest of the script address nodes by class.
+    Children can be reached as attributes: ``node.foo`` is the first child
+    whose name is ``foo``.
+    """
+
+    def __init__(self, **args):
+        """Create a node from keyword attributes.
+
+        Raises:
+            AttributeError: if no ``name`` keyword was supplied.
+        """
+        self.childs = []
+        self.attrs = {}
+        for arg in args:
+            setattr(self, arg, args[arg])
+        self.tag = self.name
+        # HTMLParser reports a valueless attribute as None; keep the tag
+        # name in that case so that ``name`` is always a string.
+        if self.name == "div" and self.attrs.get("class") is not None:
+            # Work on a copy so the dict handed in by the caller is untouched.
+            self.attrs = dict(self.attrs)
+            self.name = self.attrs.pop("class")
+
+    def format(self, prefix = ""):
+        """Return this node and its subtree as indented pseudo-XML text.
+
+        ``prefix`` is prepended to every line of this node; children are
+        rendered with a longer prefix.  A node named "data" is rendered
+        with its ``data`` text inline.
+        """
+        parts = [prefix + "<" + self.name]
+        for attr, value in self.attrs.items():
+            if value:
+                parts.append("\n" + prefix + " " + attr + '="' + value + '"')
+            else:
+                # Valueless (or empty) attribute: print the bare name.
+                parts.append("\n" + prefix + " " + attr)
+        if self.name == "data":
+            parts.append(">" + self.data + "</" + self.name + ">")
+        elif self.childs:
+            parts.append(">")
+            for child in self.childs:
+                parts.append("\n" + child.format(prefix + " "))
+            parts.append("\n" + prefix + "</" + self.name + ">")
+        else:
+            parts.append("/>")
+        return "".join(parts)
+
+    def find(self, prefix):
+        """Yield the direct children whose name starts with ``prefix``."""
+        for child in self.childs:
+            if child.name.startswith(prefix):
+                yield child
+
+    def __getattr__(self, name):
+        """Resolve an unknown attribute to the first child with that name.
+
+        Only called when normal lookup fails.  The child is cached on the
+        instance so later accesses skip the search.
+
+        Raises:
+            AttributeError: if no direct child is called ``name``.
+        """
+        # Read ``childs`` straight from the instance dict: on a half-built
+        # instance (e.g. while copy/pickle probe for ``__setstate__``) a
+        # plain ``self.childs`` would re-enter __getattr__ forever.
+        for child in self.__dict__.get("childs", ()):
+            if child.name == name:
+                setattr(self, name, child)
+                return child
+        raise AttributeError(name)
+
+    def __repr__(self):
+        return self.format()
+
+class BundleParser(HTMLParser):
+    """HTMLParser subclass that builds a tree of Node objects.
+
+    The tree hangs off ``self.dom``, a synthetic node named "root".
+    ``self.current`` is the element that newly parsed children are
+    attached to; ``self.depth`` tracks how deep ``current`` is nested.
+    """
+
+    def __init__(self, **args):
+        super().__init__(**args)
+        self.dom = Node(name = "root",
+                        childs = [],
+                        parent = None)
+        self.current = self.dom
+        self.depth = 1
+
+    def handle_starttag(self, tag, attrs):
+        """Append a new element under ``current`` and descend into it."""
+        new = Node(name = tag,
+                   attrs = dict(attrs),
+                   childs = [],
+                   parent = self.current)
+        self.current.childs.append(new)
+        self.current = new
+        self.depth += 1
+
+    def handle_endtag(self, tag):
+        """Close the nearest open element called ``tag``.
+
+        Open elements nested inside it that were never closed are skipped
+        with a message.  An end tag that matches no open element is
+        reported and ignored instead of walking past the root.
+        """
+        # Locate the element being closed before touching any state.
+        target = self.current
+        while target is not self.dom and target.tag != tag:
+            target = target.parent
+        if target is self.dom:
+            print("*** Ignoring unmatched end tag", tag)
+            return
+        while self.current is not target:
+            print("*** Skipping", self.current.tag,"; looking for",tag)
+            self.current = self.current.parent
+            self.depth -= 1
+        self.depth -= 1
+        self.current = target.parent
+
+    def handle_data(self, data):
+        """Attach non-blank text to ``current`` as a node named "data"."""
+        if data.strip():
+            self.current.childs.append(Node(name = "data",
+                                            data = data,
+                                            childs = [],
+                                            parent = self.current))
+
+def findRows(dom):
+    """Yield, in document order, the outermost nodes named "row ...".
+
+    The tree below ``dom`` is searched depth-first.  A node whose name
+    starts with "row " is yielded and not searched any further; every
+    other node is descended into.
+    """
+    for child in dom.childs:
+        if child.name.startswith("row "):
+            yield child
+        else:
+            yield from findRows(child)
+
+class Download:
+    """A single downloadable file, extracted from a "download ..." node.
+
+    Attributes set by the constructor:
+        dltype:  identifier of the enclosing download group (e.g. "linux").
+        id:      class suffix of the node followed by the button caption.
+        attrs:   set of normalised tags derived from the words of ``id``.
+        torrent: value of the button link's ``data-bt`` attribute.
+        web:     value of the button link's ``data-web`` attribute.
+        size:    text of the size span, or "Unknown".
+        md5:     ``href`` of the details link, or "Unknown".
+    """
+
+    # Raw word found in ``id`` -> normalised tags it stands for.
+    subst = { "arc32" : ("x86",),
+              "arc64" : ("x64",),
+              "i386.deb" : ("x86","deb"),
+              "x86_64.deb" : ("x64", "deb"),
+              "i686.rpm" : ("x86", "rpm"),
+              ".i386.rpm" : ("x86", "rpm"),
+              "x86_64.rpm" : ("x64", "rpm"),
+              ".x86_64.rpm" : ("x64", "rpm"),
+              "i386.tar.gz" : ("x86", "tgz"),
+              "x86_64.tar.gz" : ("x64", "tgz"),
+              ".tar.gz" : ("tgz",),
+              ".deb" : ("deb",),
+              ".rpm" : ("rpm",),
+              "32-bit" : ("x86",),
+              "64-bit" : ("x64",),
+              }
+
+    def __init__(self, dltype, dom):
+        """Read one download out of ``dom``.
+
+        Args:
+            dltype: identifier of the download group this file belongs to.
+            dom: node named "download ..." holding a "flexbtn ..." child
+                and a ``dldetails``/``dlsize`` subtree.
+
+        Raises:
+            IndexError: if ``dom`` has no "flexbtn ..." child.
+            KeyError: if the button link lacks ``data-bt`` or ``data-web``.
+        """
+        self.dltype = dltype
+        self.id = dom.name[len("download "):]
+        buttons = list(dom.find("flexbtn "))
+        if not buttons:
+            raise IndexError("download %r has no 'flexbtn' button" % (self.id,))
+        button = buttons[0]
+        desc = button.span.data.data
+        self.id += " " + desc
+        self.attrs = set(chain.from_iterable(
+            self._normalise(word) for word in self.id.split(" ")))
+        urls = button.a.attrs
+        self.torrent = urls["data-bt"]
+        self.web = urls["data-web"]
+        details = dom.dldetails.dlsize
+        if details.childs:
+            self.size = details.span.data.data
+            self.md5 = details.a.attrs["href"]
+        else:
+            self.size = "Unknown"
+            self.md5 = "Unknown"
+
+    def _normalise(self, word):
+        """Return the tags for one word of ``id`` (empty for filler words)."""
+        word = word.strip()
+        if word in ("Download", "small", ""):
+            return ()
+        return self.subst.get(word, (word,))
+
+    def score(self):
+        """Rate how desirable this download is; higher is better.
+
+        Negative scores mark downloads that must never be picked.
+
+        Raises:
+            Exception: for an audio download of unrecognised format, or
+                for an unrecognised ``dltype``.
+        """
+        if self.dltype == "audio":
+            if "FLAC" in self.attrs:
+                return 2
+            if "MP3" in self.attrs:
+                return 1
+            if "website" in self.attrs:
+                return -1
+            raise Exception("Unknown audio type: %r" % (self.attrs,))
+        if self.dltype in ("mac","windows"):
+            return -1
+        if self.dltype == "linux":
+            # Base score 1, plus one point each for 64-bit and .deb builds.
+            score = 1
+            if "x64" in self.attrs:
+                score += 1
+            if "deb" in self.attrs:
+                score += 1
+            return score
+        if self.dltype == "android":
+            return 0
+        raise Exception("Unknown dls type: %r" % (self,))
+
+    def format(self, prefix=""):
+        """Return a pseudo-XML description, each line starting with ``prefix``."""
+        lines = [
+            prefix + '<download id="' + self.id + '">',
+            prefix + " <web>" + self.web + "</web>",
+            prefix + " <torrent>" + self.torrent + "</torrent>",
+            prefix + " <size>" + self.size + "</size>",
+            prefix + " <md5>" + self.md5 + "</md5>",
+            prefix + "</download>",
+        ]
+        return "\n".join(lines)
+
+    def __repr__(self):
+        return self.format()
+
+class Downloads:
+    """Group of Download objects gathered from a "downloads ..." node.
+
+    ``id`` is the first word following "downloads " in the node name,
+    ``elements`` holds the Download objects found below the node and
+    ``others`` collects child nodes that were not recognised.
+    """
+
+    def __init__(self, dom):
+        suffix = dom.name[len("downloads "):]
+        self.id = suffix.split(" ")[0]
+        self.elements = []
+        self.others = []
+        self.addchilds(dom)
+
+    def addchilds(self, dom):
+        """Sort the children of ``dom`` into ``elements`` and ``others``.
+
+        Nested "downloads..." and "arc-toggle" nodes are searched
+        recursively; "clearfix" and "label" nodes are dropped.
+        """
+        for node in dom.childs:
+            label = node.name
+            if label.startswith("downloads") or label == "arc-toggle":
+                self.addchilds(node)
+            elif label.startswith("download"):
+                self.elements.append(Download(self.id, node))
+            elif label not in ("clearfix","label"):
+                self.others.append(node)
+
+    def __iter__(self):
+        return iter(self.elements)
+
+    def format(self, prefix = ""):
+        """Return a pseudo-XML description, each line starting with ``prefix``."""
+        out = [prefix + '<downloads id="' + self.id + '">']
+        out.extend(el.format(prefix + " ") for el in self.elements)
+        if self.others:
+            out.append(prefix + " <others>")
+            out.extend(o.format(prefix + " ") for o in self.others)
+            out.append(prefix + " </others>")
+        out.append(prefix + "</downloads>")
+        return "\n".join(out)
+
+    def choose(self):
+        """Return a list with the best-scoring download, or an empty list.
+
+        Downloads with a negative score are never picked; among equal top
+        scores the one listed first wins.
+        """
+        rated = [(dl.score(), dl) for dl in self]
+        eligible = [pair for pair in rated if pair[0] >= 0]
+        if not eligible:
+            return []
+        # max() keeps the first of several equally rated candidates.
+        winner = max(eligible, key = lambda pair: pair[0])
+        return [winner[1]]
+
+    def __repr__(self):
+        return self.format()
+
+class Game:
+    """One game of the bundle, built from a "row ..." node.
+
+    ``title`` defaults to "unknown" and is replaced by the stripped link
+    text of the row's ``gameinfo`` block when present.  ``downloads`` is a
+    list of Downloads groups; ``others`` keeps unrecognised child nodes.
+    """
+
+    def __init__(self, dom):
+        self.title = "unknown"
+        self.downloads = []
+        self.others = []
+        for node in dom.childs:
+            kind = node.name
+            if kind == "gameinfo":
+                link = dom.gameinfo.title.a
+                self.title = link.data.data.strip()
+                continue
+            if kind.startswith("downloads "):
+                self.downloads.append(Downloads(node))
+                continue
+            # "icn" and "clearfix" children are dropped on purpose.
+            if kind not in ["icn", "clearfix"]:
+                self.others.append(node)
+
+    def __repr__(self):
+        """Return a pseudo-XML dump of the title, downloads and leftovers."""
+        out = ["<game>", " <title>" + self.title + "</title>"]
+        if self.downloads:
+            out.append(" <downloads>")
+            out.extend(group.format(" ") for group in self.downloads)
+            out.append(" </downloads>")
+        if self.others:
+            out.append(" <others>")
+            out.extend(extra.format(" ") for extra in self.others)
+            out.append(" </others>")
+        out.append("</game>")
+        return "\n".join(out)
+
+def parseGames(dom):
+    """Yield a Game for every row node that findRows() locates below ``dom``."""
+    yield from map(Game, findRows(dom))
+
+# Parse the tidied bundle page into a Node tree.
+parser = BundleParser()
+with open("tidy_bundle.html") as f:
+    for line in f:
+        parser.feed(line)
+# feed() may keep trailing input buffered; close() forces it to be processed.
+parser.close()
+
+# One output line per download: a "*" marks the download picked by choose(),
+# followed by game title, download group id, sorted tags and torrent URL.
+for game in parseGames(parser.dom):
+    for dls in game.downloads:
+        chosen = dls.choose()
+        for dl in dls:
+            print("%s | %-20s | %-10s | %-25s | %s " % (
+                "*" if dl in chosen else " ",
+                game.title,
+                dls.id,
+                ", ".join(sorted(dl.attrs)),
+                dl.torrent))