update-hib.py
changeset 0 1e76c59aa3a6
child 1 fb1ab147b2dd
equal deleted inserted replaced
-1:000000000000 0:1e76c59aa3a6
       
     1 #!/usr/bin/python3
       
     2 
       
     3 from html.parser import HTMLParser
       
     4 from pprint import pprint
       
     5 import xml.dom
       
     6 from itertools import chain
       
     7 
       
     8 class Node:
       
     9     def __init__(self, **args):
       
    10         self.childs = []
       
    11         self.attrs = {}
       
    12         for arg in args:
       
    13             setattr(self, arg, args[arg])
       
    14         if self.name == "div" and "class" in self.attrs:
       
    15             self.tag = self.name
       
    16             self.name = self.attrs["class"]
       
    17             del self.attrs["class"]
       
    18         else:
       
    19             self.tag = self.name
       
    20     def format(self, prefix = ""):
       
    21         res = prefix + "<" + self.name
       
    22         for attr in self.attrs:
       
    23             if self.attrs[attr]:
       
    24                 res += "\n" + prefix + "  " + attr + '="' + self.attrs[attr] + '"'
       
    25             else:
       
    26                 res += "\n" + prefix + "  " + attr
       
    27         if self.name == "data":
       
    28             res += ">" + self.data + "</" + self.name + ">"
       
    29         elif self.childs:
       
    30             res += ">"
       
    31             for child in self.childs:
       
    32                 res += "\n" + child.format(prefix + "  ")
       
    33             res += "\n" + prefix + "</" + self.name + ">"
       
    34         else:
       
    35             res += "/>"
       
    36         return res
       
    37     def find(self, prefix):
       
    38         for child in self.childs:
       
    39             if child.name.startswith(prefix):
       
    40                 yield child
       
    41     def __getattr__(self, name):
       
    42         for child in self.childs:
       
    43             if child.name == name:
       
    44                 setattr(self, name, child)
       
    45                 return child
       
    46         raise AttributeError(name)
       
    47     def __repr__(self):
       
    48         return self.format()
       
    49 
       
    50 class BundleParser(HTMLParser):
       
    51     def __init__(self, **args):
       
    52         super(BundleParser, self).__init__(**args)
       
    53         self.dom = Node(name = "root",
       
    54                         childs = [],
       
    55                         parent = None)
       
    56         self.current = self.dom
       
    57         self.depth = 1
       
    58     def handle_starttag(self, tag, attrs):
       
    59         # print("+" * self.depth,tag)
       
    60         new = Node(name = tag,
       
    61                    attrs = dict(attrs),
       
    62                    childs = [],
       
    63                    parent = self.current)
       
    64         self.current.childs.append(new)
       
    65         self.current = new
       
    66         self.depth += 1
       
    67     def handle_endtag(self, tag):
       
    68         while tag != self.current.tag:
       
    69             print("*** Skipping", self.current.tag,"; looking for",tag)
       
    70             self.current = self.current.parent
       
    71             self.depth-=1
       
    72             # print("-" * self.depth,self.current.tag)
       
    73             assert(self.current != self.dom)
       
    74         assert(self.current.tag == tag)
       
    75         self.depth-=1
       
    76         # print("-" * self.depth,tag)
       
    77         self.current = self.current.parent
       
    78     def handle_data(self, data):
       
    79         if data.strip():
       
    80             self.current.childs.append(Node(name = "data", data = data, childs = []))
       
    81 
       
    82 def findRows(dom):
       
    83     for child in dom.childs:
       
    84         try:
       
    85             if child.name[:4] == "row ":
       
    86                 yield child
       
    87             else:
       
    88                 for row in findRows(child):
       
    89                     yield row
       
    90         except KeyError:
       
    91             pass
       
    92             for row in findRows(child):
       
    93                 yield row
       
    94 
       
    95 class Download:
       
    96     subst = { "arc32"         : ("x86",),
       
    97               "arc64"         : ("x64",),
       
    98               "i386.deb"      : ("x86","deb"),
       
    99               "x86_64.deb"    : ("x64", "deb"),
       
   100               "i686.rpm"      : ("x86", "rpm"),
       
   101               ".i386.rpm"     : ("x86", "rpm"),
       
   102               "x86_64.rpm"    : ("x64", "rpm"),
       
   103               ".x86_64.rpm"   : ("x64", "rpm"),
       
   104               "i386.tar.gz"   : ("x86", "tgz"),
       
   105               "x86_64.tar.gz" : ("x64", "tgz"),
       
   106               ".tar.gz"       : ("tgz",),
       
   107               ".deb"          : ("deb",),
       
   108               ".rpm"          : ("rpm",),
       
   109               "32-bit"        : ("x86",),
       
   110               "64-bit"        : ("x64",),              
       
   111               }
       
   112     def __init__(self, dltype, dom):
       
   113         self.dltype = dltype
       
   114         self.id = dom.name[len("download "):]
       
   115         button = list(dom.find("flexbtn "))[0]
       
   116         desc = button.span.data.data
       
   117         self.id += " " + desc
       
   118         def cleanup(attr):
       
   119             attr = attr.strip()
       
   120             if attr not in ("Download","small",""):
       
   121                 for s in self.subst.get(attr,(attr,)):
       
   122                     yield s
       
   123         self.attrs = set(chain.from_iterable(cleanup(attr) for attr in self.id.split(" ")))
       
   124         urls = button.a.attrs
       
   125         self.torrent = urls["data-bt"]
       
   126         self.web = urls["data-web"]
       
   127         details = dom.dldetails.dlsize
       
   128         if details.childs:
       
   129             self.size = details.span.data.data
       
   130             self.md5 = details.a.attrs["href"]
       
   131         else:
       
   132             self.size = "Unknown"
       
   133             self.md5 = "Unknown"
       
   134     def score(self):
       
   135         if self.dltype == "audio":
       
   136             if "FLAC" in self.attrs:
       
   137                 return 2
       
   138             if "MP3" in self.attrs:
       
   139                 return 1
       
   140             if "website" in self.attrs:
       
   141                 return -1
       
   142             raise Exception("Unknow audio type: %r" % (self.attrs))
       
   143         if self.dltype in ("mac","windows"):
       
   144             return -1
       
   145         if self.dltype == "linux":
       
   146             score = 1
       
   147             if "x64" in self.attrs:
       
   148                 score += 1
       
   149             if "deb" in self.attrs:
       
   150                 score += 1
       
   151             return score
       
   152         if self.dltype == "android":
       
   153             return 0
       
   154         raise Exception("Unknown dls type: %r" % (self,))
       
   155     def format(self, prefix=""):
       
   156         res = prefix + '<download id="' + self.id + '">\n'
       
   157         res += prefix + "  <web>" + self.web + "</web>\n"
       
   158         res += prefix + "  <torrent>" + self.torrent + "</torrent>\n"
       
   159         res += prefix + "  <size>" + self.size + "</size>\n"
       
   160         res += prefix + "  <md5>" + self.md5 + "</md5>\n"
       
   161         res += prefix + "</download>"
       
   162         return res
       
   163     def __repr__(self):
       
   164         return self.format()
       
   165 
       
   166 class Downloads:
       
   167     def __init__(self, dom):
       
   168         self.id = dom.name[len("downloads "):].split(" ")[0]
       
   169         self.elements = []
       
   170         self.others = []
       
   171         self.addchilds(dom)
       
   172     def addchilds(self, dom):
       
   173         for child in dom.childs:
       
   174             if child.name.startswith("downloads"):
       
   175                 self.addchilds(child)
       
   176             elif child.name.startswith("download"):
       
   177                 self.elements.append(Download(self.id, child))
       
   178             elif child.name == "arc-toggle":
       
   179                 self.addchilds(child)
       
   180             elif child.name in ("clearfix","label"):
       
   181                 pass
       
   182             else:
       
   183                 self.others.append(child)
       
   184     def __iter__(self):
       
   185         return iter(self.elements)
       
   186     def format(self, prefix = ""):
       
   187         res = prefix + '<downloads id="' + self.id + '">\n'
       
   188         if self.elements:
       
   189             for el in self.elements:
       
   190                 res += el.format(prefix + "  ") + "\n"
       
   191         if self.others:
       
   192             res += prefix + "  <others>\n"
       
   193             for o in self.others:
       
   194                 res += o.format(prefix + "    ") + "\n"
       
   195             res += prefix + "  </others>\n"
       
   196         res += prefix + "</downloads>"
       
   197         return res
       
   198     def choose(self):
       
   199         scores = list((dl.score(),dl) for dl in self if dl.score() >= 0)
       
   200         scores.sort(key = lambda x: x[0], reverse = True)
       
   201         for s, dl in scores:
       
   202             return [dl]
       
   203         return []
       
   204     def __repr__(self):
       
   205         return self.format()
       
   206         
       
   207 class Game:
       
   208     def __init__(self, dom):
       
   209         self.title = "unknown"
       
   210         self.downloads = []
       
   211         self.others = []
       
   212         for child in dom.childs:
       
   213             if child.name == "gameinfo":
       
   214                 self.title = dom.gameinfo.title.a.data.data.strip()
       
   215             elif child.name.startswith("downloads "):
       
   216                 self.downloads.append(Downloads(child))
       
   217             elif child.name in ["icn", "clearfix"]:
       
   218                 pass
       
   219             else:
       
   220                 self.others.append(child)
       
   221     def __repr__(self):
       
   222         res  = "<game>\n"
       
   223         res += "  <title>" + self.title + "</title>\n"
       
   224         if self.downloads:
       
   225             res += "  <downloads>\n"
       
   226             for dl in self.downloads:
       
   227                 res += dl.format("    ") + "\n"
       
   228             res += "  </downloads>\n"
       
   229         if self.others:
       
   230             res += "  <others>\n"
       
   231             for o in self.others:
       
   232                 res += o.format("    ") + "\n"
       
   233             res += "  </others>\n"
       
   234         res += "</game>"
       
   235         return res
       
   236 
       
   237 def parseGames(dom):
       
   238     for row in findRows(dom):
       
   239         yield Game(row)
       
   240 
       
   241 parser = BundleParser()
       
   242 with open("tidy_bundle.html") as f:
       
   243     for l in f:
       
   244         parser.feed(l)
       
   245 
       
   246 for game in parseGames(parser.dom):
       
   247     for dls in game.downloads:
       
   248         choosen = dls.choose()
       
   249         for dl in dls:
       
   250             print("%s | %-20s | %-10s | %-25s | %s " % (
       
   251                     "*" if dl in choosen else " ",
       
   252                     game.title, 
       
   253                     dls.id,
       
   254                     ", ".join(sorted(dl.attrs)),
       
   255                     dl.torrent))