update-hib.py
changeset 9 e3a2bb2bae8d
parent 8 98065a298da0
child 10 d7e256c9aec9
--- a/update-hib.py	Sun Mar 17 22:09:01 2013 -0400
+++ b/update-hib.py	Sun Sep 22 22:07:04 2013 -0400
@@ -16,101 +16,12 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-
-from html.parser import HTMLParser
+import bs4
 from pprint import pprint
-import xml.dom
 from itertools import chain, groupby
 import logging
 import operator
 
-class Node:
-    def __init__(self, **args):
-        self.childs = []
-        self.attrs = {}
-        for arg in args:
-            setattr(self, arg, args[arg])
-        if self.name == "div" and "class" in self.attrs:
-            self.tag = self.name
-            self.name = self.attrs["class"]
-            del self.attrs["class"]
-        else:
-            self.tag = self.name
-    def format(self, prefix = ""):
-        res = prefix + "<" + self.name
-        for attr in self.attrs:
-            if self.attrs[attr]:
-                res += "\n" + prefix + "  " + attr + '="' + self.attrs[attr] + '"'
-            else:
-                res += "\n" + prefix + "  " + attr
-        if self.name == "data":
-            res += ">" + self.data + "</" + self.name + ">"
-        elif self.childs:
-            res += ">"
-            for child in self.childs:
-                res += "\n" + child.format(prefix + "  ")
-            res += "\n" + prefix + "</" + self.name + ">"
-        else:
-            res += "/>"
-        return res
-    def find(self, prefix):
-        for child in self.childs:
-            if child.name.startswith(prefix):
-                yield child
-    def __getattr__(self, name):
-        for child in self.childs:
-            if child.name == name:
-                setattr(self, name, child)
-                return child
-        raise AttributeError(name)
-    def __repr__(self):
-        return self.format()
-
-class BundleParser(HTMLParser):
-    def __init__(self, **args):
-        super(BundleParser, self).__init__(**args)
-        self.dom = Node(name = "root",
-                        childs = [],
-                        parent = None)
-        self.current = self.dom
-        self.depth = 1
-    def handle_starttag(self, tag, attrs):
-        # print("+" * self.depth,tag)
-        new = Node(name = tag,
-                   attrs = dict(attrs),
-                   childs = [],
-                   parent = self.current)
-        self.current.childs.append(new)
-        self.current = new
-        self.depth += 1
-    def handle_endtag(self, tag):
-        while tag != self.current.tag:
-            print("*** Skipping", self.current.tag,"; looking for",tag)
-            self.current = self.current.parent
-            self.depth-=1
-            # print("-" * self.depth,self.current.tag)
-            assert(self.current != self.dom)
-        assert(self.current.tag == tag)
-        self.depth-=1
-        # print("-" * self.depth,tag)
-        self.current = self.current.parent
-    def handle_data(self, data):
-        if data.strip():
-            self.current.childs.append(Node(name = "data", data = data, childs = []))
-
-def findRows(dom):
-    for child in dom.childs:
-        try:
-            if child.name[:4] == "row ":
-                yield child
-            else:
-                for row in findRows(child):
-                    yield row
-        except KeyError:
-            pass
-            for row in findRows(child):
-                yield row
-
 class Download:
     subst = { "arc32"         : ("x86",),
               "arc64"         : ("x64",),
@@ -130,54 +41,57 @@
               "(HD)"          : ("HD",),
               "(MP3)"         : ("MP3",),
               }
-    def __init__(self, dltype, dom):
+    def __init__(self, dltype, soup):  # soup: bs4 tag carrying the "download" class
         self.dltype = dltype
-        self.id = dom.name[len("download "):]
-        button = list(dom.find("flexbtn "))[0]
-        desc = button.span.data.data
-        self.id += " " + desc
+        ids = [attr for attr in soup["class"] if attr != "download"]
+        button = soup.find(class_="flexbtn")
+        desc = button.span.string
+        ids.extend(desc.split(" "))
+        self.id = " ".join(ids)
         def cleanup(attr):
             attr = attr.strip()
             if attr not in ("Download","small",""):
                 for s in self.subst.get(attr,(attr,)):
                     yield s
-        self.attrs = set(chain.from_iterable(cleanup(attr) for attr in self.id.split(" ")))
+        self.attrs = set(chain.from_iterable(cleanup(attr) for attr in ids))
         urls = button.a.attrs
         self.torrent = urls["data-bt"]
         self.web = urls["data-web"]
-        details = dom.dldetails.dlsize
-        if details.childs:
-            self.size = details.span.data.data
-            self.md5 = details.a.attrs["href"]
-        else:
-            self.size = "Unknown"
-            self.md5 = "Unknown"
+        details = soup.find(class_="dldetails").find(class_="dlsize")
+        size = details.find(class_="mbs")
+        md5 = details.find(class_="dlmd5")
+        date = details.find(class_="dldate")
+        self.size = (size.string if size else None) or "Unknown"  # .string may be None; format() needs a str
+        self.md5 = (md5.string if md5 else None) or "Unknown"  # same fallback for a missing or empty tag
+        self.date = (date.string if date else None) or "Unknown"  # same fallback for a missing or empty tag
     def format(self, prefix=""):
         res = prefix + '<download type="' + self.dltype + '" id="' + self.id + '">\n'
         res += prefix + "  <web>" + self.web + "</web>\n"
         res += prefix + "  <torrent>" + self.torrent + "</torrent>\n"
         res += prefix + "  <size>" + self.size + "</size>\n"
         res += prefix + "  <md5>" + self.md5 + "</md5>\n"
+        res += prefix + "  <date>" + self.date + "</date>\n"
         res += prefix + "</download>"
         return res
     def __repr__(self):
         return self.format()
 
 class Downloads:
-    def __init__(self, dom):
-        self.id = dom.name[len("downloads "):].split(" ")[0]
+    def __init__(self, soup):
+        self.id = [class_ for class_ in soup["class"] if class_ != "downloads"][0]
         self.elements = []
         self.others = []
-        self.addchilds(dom)
-    def addchilds(self, dom):
-        for child in dom.childs:
-            if child.name.startswith("downloads"):
+        self.addchilds(soup)
+    def addchilds(self, soup):
+        for child in soup.children:
+            if not isinstance(child, bs4.element.Tag):  # skip non-tag children
+                continue
+            classes = child.get("class", [])
+            if any(attr in ("arc-toggle", "downloads") for attr in classes):  # containers: recurse
                 self.addchilds(child)
-            elif child.name.startswith("download"):
+            elif "download" in classes:
                 self.elements.append(Download(self.id, child))
-            elif child.name == "arc-toggle":
-                self.addchilds(child)
-            elif child.name in ("clearfix","label"):
+            elif any(attr in ("clearfix","label") for attr in classes):  # ignored on purpose
                 pass
             else:
                 self.others.append(child)
@@ -199,16 +113,19 @@
         return self.format()
 
 class Game:
-    def __init__(self, dom):
+    def __init__(self, soup):
         self.title = "unknown"
         self.downloads = []
         self.others = []
-        for child in dom.childs:
-            if child.name == "gameinfo":
-                self.title = dom.gameinfo.title.a.data.data.strip()
-            elif child.name.startswith("downloads "):
+        for child in soup.children:
+            if not isinstance(child, bs4.element.Tag):  # skip non-tag children
+                continue
+            classes = child.get("class", [])
+            if "gameinfo" in classes:
+                self.title = child.find(class_="title").a.string.strip()
+            elif "downloads" in classes:
                 self.downloads.append(Downloads(child))
-            elif child.name in ["icn", "clearfix"]:
+            elif any(attr in ["icn", "clearfix"] for attr in classes):  # ignored on purpose
                 pass
             else:
                 self.others.append(child)
@@ -228,15 +145,12 @@
         res += "</game>"
         return res
 
-def parseGamesFromDom(dom):
-    for row in findRows(dom):
+def parseGamesFromSoup(soup):
+    for row in soup.find_all(class_="row"):
         yield Game(row)
 
 def parseGamesFromFile(filename):
-    parser = BundleParser()
-    for l in open(filename):
-        parser.feed(l)
-    for game in parseGamesFromDom(parser.dom):
+    for game in parseGamesFromSoup(bs4.BeautifulSoup(open(filename))):
         yield game
 
 class FileSelector:
@@ -299,11 +213,12 @@
             scores = list(selector(dls))
             choosen = selectHighestScore(scores)
             for score, dl in scores:
-                print("[%s] %2d | %-20s | %-10s | %-25s | %s " % (
+                print("[%s] %2d | %-20s | %-15s | %-10s | %-25s | %s " % (
                         "*" if dl in choosen else " ",
                         score,
                         game.title,
                         dls.id,
+                        dl.date,  # date is set on each Download (dl); Downloads.__init__ sets none
                         ", ".join(sorted(dl.attrs)),
                         dl.torrent))
                 if dl in choosen: