#!/usr/bin/python3
# HG changeset patch
# User Fabien Ninoles
# Date 1345998185 14400
# Node ID 1e76c59aa3a605ae934999b9680c58158a166189
# Initial version: parse tidy file and select a suitable download url.
# diff -r 000000000000 -r 1e76c59aa3a6 update-hib.py
"""Parse a tidied bundle page and pick one download per platform.

The page is parsed into a tree of ``Node`` objects.  Every ``div`` whose
class starts with ``"row "`` becomes a ``Game``; its ``"downloads ..."``
children become ``Downloads`` groups holding ``Download`` entries.  For each
group the best-scoring download is marked with ``*`` in the printed table.

NOTE(review): the ``format``/``__repr__`` methods emit exactly the text
visible in the recovered source.  Several of their literals look like they
once carried markup that was lost in extraction -- confirm against the
original changeset before relying on that output.
"""

from html.parser import HTMLParser
from pprint import pprint
import xml.dom
from itertools import chain


class Node:
    """One element (or text chunk) of the parsed document tree.

    Keyword arguments are stored verbatim as attributes; callers pass at
    least ``name`` and usually ``attrs``, ``childs`` and ``parent``.  A
    ``div`` carrying a ``class`` attribute is renamed after that class (the
    class is removed from ``attrs``); ``tag`` always keeps the element name.
    """

    def __init__(self, **args):
        self.childs = []
        self.attrs = {}
        for key, value in args.items():
            setattr(self, key, value)
        self.tag = self.name
        if self.name == "div" and "class" in self.attrs:
            # Address divs by their class so lookups such as ``dom.gameinfo``
            # or ``find("flexbtn ")`` work.
            self.name = self.attrs.pop("class")

    def format(self, prefix=""):
        """Return a multi-line text dump of this node, indented by *prefix*."""
        parts = [prefix + "<" + self.name]
        for attr, value in self.attrs.items():
            if value:
                parts.append("\n" + prefix + " " + attr + '="' + value + '"')
            else:
                # Valueless attribute: the parser reports its value as None.
                parts.append("\n" + prefix + " " + attr)
        if self.name == "data":
            parts.append(">" + self.data)
        elif self.childs:
            parts.append(">")
            for child in self.childs:
                parts.append("\n" + child.format(prefix + " "))
            parts.append("\n" + prefix)
        else:
            parts.append("/>")
        return "".join(parts)

    def find(self, prefix):
        """Yield the direct children whose name starts with *prefix*."""
        for child in self.childs:
            if child.name.startswith(prefix):
                yield child

    def __getattr__(self, name):
        """Resolve an unknown attribute to the first child named *name*.

        The child is cached on the instance so later lookups bypass this
        hook.  Raises ``AttributeError`` when no such child exists.
        """
        # Read ``childs`` through __dict__: going through normal attribute
        # access would re-enter this hook forever on an instance whose
        # __init__ has not run (e.g. one created by copy/pickle).
        for child in self.__dict__.get("childs", ()):
            if child.name == name:
                setattr(self, name, child)
                return child
        raise AttributeError(name)

    def __repr__(self):
        return self.format()


class BundleParser(HTMLParser):
    """HTMLParser that builds a ``Node`` tree rooted at ``self.dom``."""

    def __init__(self, **args):
        super().__init__(**args)
        self.dom = Node(name="root", childs=[], parent=None)
        self.current = self.dom  # innermost element still open
        self.depth = 1

    def handle_starttag(self, tag, attrs):
        """Open a new element below the current one and descend into it."""
        node = Node(name=tag, attrs=dict(attrs), childs=[],
                    parent=self.current)
        self.current.childs.append(node)
        self.current = node
        self.depth += 1

    def handle_endtag(self, tag):
        """Close the innermost open *tag* element.

        Elements left open inside it (void tags such as ``br``, or sloppy
        markup) are closed implicitly with a diagnostic.  An end tag that
        matches no open element is ignored instead of unwinding past the
        root.
        """
        target = self.current
        while target is not self.dom and target.tag != tag:
            target = target.parent
        if target is self.dom:
            print("*** Ignoring unmatched end tag", tag)
            return
        while self.current is not target:
            print("*** Skipping", self.current.tag, "; looking for", tag)
            self.current = self.current.parent
            self.depth -= 1
        self.depth -= 1
        self.current = target.parent

    def handle_data(self, data):
        """Attach non-blank text to the current element as a "data" node."""
        if data.strip():
            self.current.childs.append(
                Node(name="data", data=data, childs=[]))


def findRows(dom):
    """Yield every outermost node below *dom* whose name starts with "row ".

    A matching node is yielded without being searched further; any other
    node is searched recursively.
    """
    for child in dom.childs:
        if child.name[:4] == "row ":
            yield child
        else:
            yield from findRows(child)


class Download:
    """A single downloadable file described by a "download ..." node."""

    # Maps raw description tokens to normalised attribute names.
    subst = {
        "arc32": ("x86",),
        "arc64": ("x64",),
        "i386.deb": ("x86", "deb"),
        "x86_64.deb": ("x64", "deb"),
        "i686.rpm": ("x86", "rpm"),
        ".i386.rpm": ("x86", "rpm"),
        "x86_64.rpm": ("x64", "rpm"),
        ".x86_64.rpm": ("x64", "rpm"),
        "i386.tar.gz": ("x86", "tgz"),
        "x86_64.tar.gz": ("x64", "tgz"),
        ".tar.gz": ("tgz",),
        ".deb": ("deb",),
        ".rpm": ("rpm",),
        "32-bit": ("x86",),
        "64-bit": ("x64",),
    }

    def __init__(self, dltype, dom):
        """Read ids, attributes, URLs and size/md5 details from *dom*.

        *dltype* is the id of the owning ``Downloads`` group.  Raises
        ``IndexError`` when *dom* has no "flexbtn " child, and
        ``AttributeError``/``KeyError`` when expected sub-nodes or URL
        attributes are missing.
        """
        self.dltype = dltype
        button = list(dom.find("flexbtn "))[0]
        description = button.span.data.data
        self.id = dom.name[len("download "):] + " " + description
        self.attrs = set(chain.from_iterable(
            self._normalise(token) for token in self.id.split(" ")))
        urls = button.a.attrs
        self.torrent = urls["data-bt"]
        self.web = urls["data-web"]
        details = dom.dldetails.dlsize
        if details.childs:
            self.size = details.span.data.data
            self.md5 = details.a.attrs["href"]
        else:
            self.size = "Unknown"
            self.md5 = "Unknown"

    def _normalise(self, token):
        """Return the attribute names for one id token (empty for noise)."""
        token = token.strip()
        if token in ("Download", "small", ""):
            return ()
        return self.subst.get(token, (token,))

    def score(self):
        """Return this download's preference; negative means "never pick".

        Raises ``Exception`` for an unrecognised audio format or group type.
        """
        if self.dltype == "audio":
            if "FLAC" in self.attrs:
                return 2
            if "MP3" in self.attrs:
                return 1
            if "website" in self.attrs:
                return -1
            raise Exception("Unknow audio type: %r" % (self.attrs))
        if self.dltype in ("mac", "windows"):
            return -1
        if self.dltype == "linux":
            # Base score 1, plus one each for 64-bit and for a .deb package.
            return 1 + ("x64" in self.attrs) + ("deb" in self.attrs)
        if self.dltype == "android":
            return 0
        raise Exception("Unknown dls type: %r" % (self,))

    def format(self, prefix=""):
        """Return a multi-line text dump of this download."""
        return "\n".join([
            prefix,
            prefix + " " + self.web,
            prefix + " " + self.torrent,
            prefix + " " + self.size,
            prefix + " " + self.md5,
            prefix,
        ])

    def __repr__(self):
        return self.format()


class Downloads:
    """The downloads offered for one platform (one "downloads ..." node)."""

    def __init__(self, dom):
        # The first word after "downloads " is the platform/type id.
        self.id = dom.name[len("downloads "):].split(" ")[0]
        self.elements = []  # Download objects, in document order
        self.others = []    # child nodes that were not recognised
        self.addchilds(dom)

    def addchilds(self, dom):
        """Collect downloads below *dom*, descending into wrapper nodes."""
        for child in dom.childs:
            # "downloads" must be tested first: it also starts with
            # "download".
            if child.name.startswith("downloads"):
                self.addchilds(child)
            elif child.name.startswith("download"):
                self.elements.append(Download(self.id, child))
            elif child.name == "arc-toggle":
                self.addchilds(child)
            elif child.name in ("clearfix", "label"):
                pass
            else:
                self.others.append(child)

    def __iter__(self):
        return iter(self.elements)

    def format(self, prefix=""):
        """Return a multi-line text dump of this group."""
        lines = [prefix]
        lines.extend(el.format(prefix + " ") for el in self.elements)
        if self.others:
            lines.append(prefix + " ")
            lines.extend(o.format(prefix + " ") for o in self.others)
            lines.append(prefix + " ")
        lines.append(prefix)
        return "\n".join(lines)

    def choose(self):
        """Return ``[best]`` for the top non-negative score, else ``[]``.

        Ties go to the download that appears first in the group.
        """
        scored = [(dl.score(), dl) for dl in self]
        eligible = [pair for pair in scored if pair[0] >= 0]
        if not eligible:
            return []
        # max() returns the first maximal item, keeping document order
        # as the tie-breaker.
        return [max(eligible, key=lambda pair: pair[0])[1]]

    def __repr__(self):
        return self.format()


class Game:
    """One game row: its title plus its per-platform download groups."""

    def __init__(self, dom):
        self.title = "unknown"
        self.downloads = []
        self.others = []  # child nodes that were not recognised
        for child in dom.childs:
            if child.name == "gameinfo":
                self.title = dom.gameinfo.title.a.data.data.strip()
            elif child.name.startswith("downloads "):
                self.downloads.append(Downloads(child))
            elif child.name in ["icn", "clearfix"]:
                pass
            else:
                self.others.append(child)

    def __repr__(self):
        lines = ["", " " + self.title]
        if self.downloads:
            lines.append(" ")
            lines.extend(dl.format(" ") for dl in self.downloads)
            lines.append(" ")
        if self.others:
            lines.append(" ")
            lines.extend(o.format(" ") for o in self.others)
            lines.append(" ")
        lines.append("")
        return "\n".join(lines)


def parseGames(dom):
    """Yield a ``Game`` for every row node found below *dom*."""
    for row in findRows(dom):
        yield Game(row)


def main(path="tidy_bundle.html"):
    """Parse *path* and print one table line per available download."""
    parser = BundleParser()
    with open(path) as f:
        for line in f:
            parser.feed(line)
    # Flush any text the parser is still buffering at end of input.
    parser.close()

    for game in parseGames(parser.dom):
        for dls in game.downloads:
            chosen = dls.choose()
            for dl in dls:
                print("%s | %-20s | %-10s | %-25s | %s " % (
                    "*" if dl in chosen else " ",
                    game.title,
                    dls.id,
                    ", ".join(sorted(dl.attrs)),
                    dl.torrent))


if __name__ == "__main__":
    main()