update-hib.py
changeset 0 1e76c59aa3a6
child 1 fb1ab147b2dd
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/update-hib.py	Sun Aug 26 12:23:05 2012 -0400
@@ -0,0 +1,255 @@
+#!/usr/bin/python3
+
+from html.parser import HTMLParser
+from pprint import pprint
+import xml.dom
+from itertools import chain
+
+class Node:
+    """Minimal tree node for the DOM that BundleParser builds.
+
+    Every keyword argument becomes an instance attribute.  ``name`` is
+    required; ``childs`` and ``attrs`` default to an empty list / dict.
+    Text nodes are created with ``name="data"`` and a string ``data``
+    attribute.
+
+    A ``div`` that carries a ``class`` attribute is renamed after its class
+    value (and the attribute is removed from ``attrs``); the HTML tag name
+    always stays available as ``tag``.
+
+    Children can be reached as attributes: ``node.foo`` returns the first
+    child whose name is exactly ``"foo"``.
+    """
+    def __init__(self, **args):
+        self.childs = []
+        self.attrs = {}
+        for key, value in args.items():
+            setattr(self, key, value)
+        # Remember the real tag before a possible rename: end tags are
+        # matched against ``tag``, not against the class-derived name.
+        self.tag = self.name
+        if self.tag == "div" and "class" in self.attrs:
+            self.name = self.attrs.pop("class")
+    def format(self, prefix = ""):
+        """Return an indented, XML-like dump of this node and its subtree.
+
+        ``prefix`` is prepended to every line of this node; children are
+        indented by two more spaces.  Values are not escaped.
+        """
+        parts = [prefix, "<", self.name]
+        for attr, value in self.attrs.items():
+            parts.append("\n" + prefix + "  " + attr)
+            # Attributes without a value (None or "") are printed bare.
+            if value:
+                parts.append('="' + value + '"')
+        # A text node is recognised by its own string payload, not just by
+        # its name: an element called "data" (or a <div class="data">) must
+        # be formatted like any other element instead of crashing.
+        text = self.__dict__.get("data")
+        if self.name == "data" and isinstance(text, str):
+            parts.append(">" + text + "</" + self.name + ">")
+        elif self.childs:
+            parts.append(">")
+            for child in self.childs:
+                parts.append("\n" + child.format(prefix + "  "))
+            parts.append("\n" + prefix + "</" + self.name + ">")
+        else:
+            parts.append("/>")
+        return "".join(parts)
+    def find(self, prefix):
+        """Yield the direct children whose name starts with ``prefix``."""
+        for child in self.childs:
+            if child.name.startswith(prefix):
+                yield child
+    def __getattr__(self, name):
+        """Resolve an unknown attribute to the first child with that name.
+
+        The hit is cached as a real attribute so later lookups bypass this
+        hook.  Raises AttributeError when no child matches.
+        """
+        # Read ``childs`` straight from the instance dict: going through
+        # normal attribute access would re-enter __getattr__ forever on an
+        # instance whose __init__ has not run (copy, pickle, __new__).
+        for child in self.__dict__.get("childs", ()):
+            if child.name == name:
+                setattr(self, name, child)
+                return child
+        raise AttributeError(name)
+    def __repr__(self):
+        return self.format()
+
+class BundleParser(HTMLParser):
+    """HTMLParser that builds a tree of Node objects under ``self.dom``.
+
+    ``self.current`` is the innermost element still open and ``self.depth``
+    is its nesting level (the synthetic root is at depth 1).  Keyword
+    arguments are passed through to HTMLParser.
+    """
+    def __init__(self, **args):
+        super().__init__(**args)
+        self.dom = Node(name = "root",
+                        childs = [],
+                        parent = None)
+        self.current = self.dom
+        self.depth = 1
+    def handle_starttag(self, tag, attrs):
+        """Open a new element below the current one and descend into it."""
+        new = Node(name = tag,
+                   attrs = dict(attrs),
+                   childs = [],
+                   parent = self.current)
+        self.current.childs.append(new)
+        self.current = new
+        self.depth += 1
+    def handle_endtag(self, tag):
+        """Close the innermost open element whose tag is ``tag``.
+
+        Open elements nested inside it that were never closed explicitly
+        are closed implicitly, with a message for each.  An end tag that
+        matches no open element is reported and ignored, leaving the tree
+        untouched.
+        """
+        # Locate the element to close before changing any state, so that a
+        # stray end tag cannot unwind the stack up to (or past) the root.
+        target = self.current
+        while target is not self.dom and target.tag != tag:
+            target = target.parent
+        if target is self.dom:
+            print("*** Ignoring unmatched end tag", tag)
+            return
+        while self.current is not target:
+            print("*** Skipping", self.current.tag,"; looking for",tag)
+            self.current = self.current.parent
+            self.depth -= 1
+        self.depth -= 1
+        self.current = target.parent
+    def handle_data(self, data):
+        """Attach non-blank text to the current element as a "data" node."""
+        if data.strip():
+            self.current.childs.append(Node(name = "data", data = data, childs = []))
+
+def findRows(dom):
+    """Yield, in document order, the nodes below ``dom`` named "row ...".
+
+    The tree is searched depth-first through ``childs``.  A node whose name
+    starts with ``"row "`` is yielded and not searched any further, so rows
+    nested inside another row are not reported.
+    """
+    for child in dom.childs:
+        if child.name.startswith("row "):
+            yield child
+        else:
+            for row in findRows(child):
+                yield row
+
+class Download:
+    """A single downloadable file, read from a "download ..." node.
+
+    ``dltype`` is the id of the group the download belongs to (Downloads
+    passes its own ``id``); ``dom`` is the node describing the download.
+    The instance exposes ``id``, ``attrs`` (a set of normalised tags),
+    ``torrent``, ``web``, ``size`` and ``md5``.
+    """
+    # Raw token from the id / button label -> normalised tags it stands for.
+    subst = {
+        "arc32": ("x86",),
+        "arc64": ("x64",),
+        "i386.deb": ("x86", "deb"),
+        "x86_64.deb": ("x64", "deb"),
+        "i686.rpm": ("x86", "rpm"),
+        ".i386.rpm": ("x86", "rpm"),
+        "x86_64.rpm": ("x64", "rpm"),
+        ".x86_64.rpm": ("x64", "rpm"),
+        "i386.tar.gz": ("x86", "tgz"),
+        "x86_64.tar.gz": ("x64", "tgz"),
+        ".tar.gz": ("tgz",),
+        ".deb": ("deb",),
+        ".rpm": ("rpm",),
+        "32-bit": ("x86",),
+        "64-bit": ("x64",),
+    }
+    def __init__(self, dltype, dom):
+        self.dltype = dltype
+        base_id = dom.name[len("download "):]
+        buttons = list(dom.find("flexbtn "))
+        button = buttons[0]
+        label = button.span.data.data
+        # The id is the node name (minus "download ") plus the button label.
+        self.id = base_id + " " + label
+        # Split the id into tokens, drop the noise words and expand the
+        # known tokens through ``subst``; unknown tokens are kept verbatim.
+        tags = set()
+        for token in self.id.split(" "):
+            token = token.strip()
+            if token in ("Download","small",""):
+                continue
+            tags.update(self.subst.get(token, (token,)))
+        self.attrs = tags
+        links = button.a.attrs
+        self.torrent = links["data-bt"]
+        self.web = links["data-web"]
+        details = dom.dldetails.dlsize
+        if not details.childs:
+            self.size = "Unknown"
+            self.md5 = "Unknown"
+        else:
+            self.size = details.span.data.data
+            self.md5 = details.a.attrs["href"]
+    def score(self):
+        """Return a preference score; negative means "never pick this".
+
+        Raises Exception for an audio download with none of the known
+        format tags and for an unknown ``dltype``.
+        """
+        kind = self.dltype
+        if kind == "audio":
+            # First matching tag wins, in this order.
+            for tag, value in (("FLAC", 2), ("MP3", 1), ("website", -1)):
+                if tag in self.attrs:
+                    return value
+            raise Exception("Unknow audio type: %r" % (self.attrs))
+        if kind in ("mac","windows"):
+            return -1
+        if kind == "linux":
+            # Base score 1, plus one point each for 64-bit and for .deb.
+            return 1 + ("x64" in self.attrs) + ("deb" in self.attrs)
+        if kind == "android":
+            return 0
+        raise Exception("Unknown dls type: %r" % (self,))
+    def format(self, prefix=""):
+        """Return an XML-like dump, every line starting with ``prefix``."""
+        inner = prefix + "  "
+        lines = [
+            prefix + '<download id="' + self.id + '">',
+            inner + "<web>" + self.web + "</web>",
+            inner + "<torrent>" + self.torrent + "</torrent>",
+            inner + "<size>" + self.size + "</size>",
+            inner + "<md5>" + self.md5 + "</md5>",
+            prefix + "</download>",
+        ]
+        return "\n".join(lines)
+    def __repr__(self):
+        return self.format()
+
+class Downloads:
+    """A group of Download entries read from a "downloads ..." node.
+
+    ``id`` is the first word after ``"downloads "`` in the node name and is
+    handed to every Download as its ``dltype``.  ``elements`` holds the
+    Download objects, ``others`` the child nodes that were not recognised.
+    Iterating the group iterates ``elements``.
+    """
+    def __init__(self, dom):
+        self.id = dom.name[len("downloads "):].split(" ")[0]
+        self.elements = []
+        self.others = []
+        self.addchilds(dom)
+    def addchilds(self, dom):
+        """Collect downloads from the children of ``dom``.
+
+        Nested "downloads..." groups and "arc-toggle" wrappers are flattened
+        into this group; "clearfix" and "label" nodes are dropped; anything
+        else that is not a download is kept in ``others``.
+        """
+        for child in dom.childs:
+            name = child.name
+            # "downloads" must be tested before "download": it is a prefix match.
+            if name.startswith("downloads") or name == "arc-toggle":
+                self.addchilds(child)
+            elif name.startswith("download"):
+                self.elements.append(Download(self.id, child))
+            elif name not in ("clearfix","label"):
+                self.others.append(child)
+    def __iter__(self):
+        return iter(self.elements)
+    def format(self, prefix = ""):
+        """Return an XML-like dump, every line starting with ``prefix``."""
+        lines = [prefix + '<downloads id="' + self.id + '">']
+        lines.extend(el.format(prefix + "  ") for el in self.elements)
+        if self.others:
+            lines.append(prefix + "  <others>")
+            lines.extend(o.format(prefix + "    ") for o in self.others)
+            lines.append(prefix + "  </others>")
+        lines.append(prefix + "</downloads>")
+        return "\n".join(lines)
+    def choose(self):
+        """Return a list holding the preferred download, or an empty list.
+
+        Downloads with a negative score are never chosen.  Among the rest
+        the highest score wins and ties go to the earliest element.  Each
+        download is scored exactly once.
+        """
+        scored = [(dl.score(), dl) for dl in self]
+        eligible = [pair for pair in scored if pair[0] >= 0]
+        if not eligible:
+            return []
+        # max() returns the first maximal item, which keeps the
+        # earliest-wins tie-break.
+        return [max(eligible, key = lambda pair: pair[0])[1]]
+    def __repr__(self):
+        return self.format()
+        
+class Game:
+    """One game row: its title, its download groups and leftover nodes.
+
+    ``title`` stays ``"unknown"`` unless the row has a "gameinfo" child.
+    ``downloads`` is a list of Downloads built from the "downloads ..."
+    children; ``others`` keeps every child that is neither recognised nor
+    deliberately ignored ("icn", "clearfix").
+    """
+    def __init__(self, dom):
+        self.title = "unknown"
+        self.downloads = []
+        self.others = []
+        ignored = ("icn", "clearfix")
+        for child in dom.childs:
+            kind = child.name
+            if kind == "gameinfo":
+                # The title text is read through the row's "gameinfo" child.
+                link = dom.gameinfo.title.a
+                self.title = link.data.data.strip()
+                continue
+            if kind.startswith("downloads "):
+                self.downloads.append(Downloads(child))
+                continue
+            if kind not in ignored:
+                self.others.append(child)
+    def __repr__(self):
+        lines = ["<game>", "  <title>" + self.title + "</title>"]
+        if self.downloads:
+            lines.append("  <downloads>")
+            lines.extend(dl.format("    ") for dl in self.downloads)
+            lines.append("  </downloads>")
+        if self.others:
+            lines.append("  <others>")
+            lines.extend(o.format("    ") for o in self.others)
+            lines.append("  </others>")
+        lines.append("</game>")
+        return "\n".join(lines)
+
+def parseGames(dom):
+    """Lazily yield one Game for every row that findRows locates in ``dom``."""
+    rows = findRows(dom)
+    for game_row in rows:
+        game = Game(game_row)
+        yield game
+
+# Build the DOM from the saved bundle page, feeding the parser line by line.
+parser = BundleParser()
+with open("tidy_bundle.html") as f:
+    for l in f:
+        parser.feed(l)
+# Flush whatever the parser is still buffering (e.g. text after the last
+# tag); without close() that trailing input would be silently dropped.
+parser.close()
+
+# One line per download; "*" marks the download picked by Downloads.choose().
+for game in parseGames(parser.dom):
+    for dls in game.downloads:
+        choosen = dls.choose()
+        for dl in dls:
+            print("%s | %-20s | %-10s | %-25s | %s " % (
+                    "*" if dl in choosen else " ",
+                    game.title, 
+                    dls.id,
+                    ", ".join(sorted(dl.attrs)),
+                    dl.torrent))