update-hib.py
changeset 9 e3a2bb2bae8d
parent 8 98065a298da0
child 10 d7e256c9aec9
equal deleted inserted replaced
8:98065a298da0 9:e3a2bb2bae8d
    14 # GNU General Public License for more details.
    14 # GNU General Public License for more details.
    15 #
    15 #
    16 # You should have received a copy of the GNU General Public License
    16 # You should have received a copy of the GNU General Public License
    17 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
    18 
    18 
    19 
    19 import bs4
    20 from html.parser import HTMLParser
       
    21 from pprint import pprint
    20 from pprint import pprint
    22 import xml.dom
       
    23 from itertools import chain, groupby
    21 from itertools import chain, groupby
    24 import logging
    22 import logging
    25 import operator
    23 import operator
    26 
       
    27 class Node:
       
    28     def __init__(self, **args):
       
    29         self.childs = []
       
    30         self.attrs = {}
       
    31         for arg in args:
       
    32             setattr(self, arg, args[arg])
       
    33         if self.name == "div" and "class" in self.attrs:
       
    34             self.tag = self.name
       
    35             self.name = self.attrs["class"]
       
    36             del self.attrs["class"]
       
    37         else:
       
    38             self.tag = self.name
       
    39     def format(self, prefix = ""):
       
    40         res = prefix + "<" + self.name
       
    41         for attr in self.attrs:
       
    42             if self.attrs[attr]:
       
    43                 res += "\n" + prefix + "  " + attr + '="' + self.attrs[attr] + '"'
       
    44             else:
       
    45                 res += "\n" + prefix + "  " + attr
       
    46         if self.name == "data":
       
    47             res += ">" + self.data + "</" + self.name + ">"
       
    48         elif self.childs:
       
    49             res += ">"
       
    50             for child in self.childs:
       
    51                 res += "\n" + child.format(prefix + "  ")
       
    52             res += "\n" + prefix + "</" + self.name + ">"
       
    53         else:
       
    54             res += "/>"
       
    55         return res
       
    56     def find(self, prefix):
       
    57         for child in self.childs:
       
    58             if child.name.startswith(prefix):
       
    59                 yield child
       
    60     def __getattr__(self, name):
       
    61         for child in self.childs:
       
    62             if child.name == name:
       
    63                 setattr(self, name, child)
       
    64                 return child
       
    65         raise AttributeError(name)
       
    66     def __repr__(self):
       
    67         return self.format()
       
    68 
       
    69 class BundleParser(HTMLParser):
       
    70     def __init__(self, **args):
       
    71         super(BundleParser, self).__init__(**args)
       
    72         self.dom = Node(name = "root",
       
    73                         childs = [],
       
    74                         parent = None)
       
    75         self.current = self.dom
       
    76         self.depth = 1
       
    77     def handle_starttag(self, tag, attrs):
       
    78         # print("+" * self.depth,tag)
       
    79         new = Node(name = tag,
       
    80                    attrs = dict(attrs),
       
    81                    childs = [],
       
    82                    parent = self.current)
       
    83         self.current.childs.append(new)
       
    84         self.current = new
       
    85         self.depth += 1
       
    86     def handle_endtag(self, tag):
       
    87         while tag != self.current.tag:
       
    88             print("*** Skipping", self.current.tag,"; looking for",tag)
       
    89             self.current = self.current.parent
       
    90             self.depth-=1
       
    91             # print("-" * self.depth,self.current.tag)
       
    92             assert(self.current != self.dom)
       
    93         assert(self.current.tag == tag)
       
    94         self.depth-=1
       
    95         # print("-" * self.depth,tag)
       
    96         self.current = self.current.parent
       
    97     def handle_data(self, data):
       
    98         if data.strip():
       
    99             self.current.childs.append(Node(name = "data", data = data, childs = []))
       
   100 
       
   101 def findRows(dom):
       
   102     for child in dom.childs:
       
   103         try:
       
   104             if child.name[:4] == "row ":
       
   105                 yield child
       
   106             else:
       
   107                 for row in findRows(child):
       
   108                     yield row
       
   109         except KeyError:
       
   110             pass
       
   111             for row in findRows(child):
       
   112                 yield row
       
   113 
    24 
   114 class Download:
    25 class Download:
   115     subst = { "arc32"         : ("x86",),
    26     subst = { "arc32"         : ("x86",),
   116               "arc64"         : ("x64",),
    27               "arc64"         : ("x64",),
   117               "i386.deb"      : ("x86","deb"),
    28               "i386.deb"      : ("x86","deb"),
   128               "32-bit"        : ("x86",),
    39               "32-bit"        : ("x86",),
   129               "64-bit"        : ("x64",),
    40               "64-bit"        : ("x64",),
   130               "(HD)"          : ("HD",),
    41               "(HD)"          : ("HD",),
   131               "(MP3)"         : ("MP3",),
    42               "(MP3)"         : ("MP3",),
   132               }
    43               }
   133     def __init__(self, dltype, dom):
    44     def __init__(self, dltype, soup):
   134         self.dltype = dltype
    45         self.dltype = dltype
   135         self.id = dom.name[len("download "):]
    46         ids = [attr for attr in soup["class"] if attr != "download"]
   136         button = list(dom.find("flexbtn "))[0]
    47         button = soup.find(class_="flexbtn")
   137         desc = button.span.data.data
    48         desc = button.span.string
   138         self.id += " " + desc
    49         ids.extend(desc.split(" "))
       
    50         self.id = " ".join(ids)
   139         def cleanup(attr):
    51         def cleanup(attr):
   140             attr = attr.strip()
    52             attr = attr.strip()
   141             if attr not in ("Download","small",""):
    53             if attr not in ("Download","small",""):
   142                 for s in self.subst.get(attr,(attr,)):
    54                 for s in self.subst.get(attr,(attr,)):
   143                     yield s
    55                     yield s
   144         self.attrs = set(chain.from_iterable(cleanup(attr) for attr in self.id.split(" ")))
    56         self.attrs = set(chain.from_iterable(cleanup(attr) for attr in ids))
   145         urls = button.a.attrs
    57         urls = button.a.attrs
   146         self.torrent = urls["data-bt"]
    58         self.torrent = urls["data-bt"]
   147         self.web = urls["data-web"]
    59         self.web = urls["data-web"]
   148         details = dom.dldetails.dlsize
    60         details = soup.find(class_="dldetails").find(class_="dlsize")
   149         if details.childs:
    61         size = details.find(class_="mbs")
   150             self.size = details.span.data.data
    62         md5 = details.find(class_="dlmd5")
   151             self.md5 = details.a.attrs["href"]
    63         date = details.find(class_="dldate")
   152         else:
    64         self.size = size.string if size else "Unknown"
   153             self.size = "Unknown"
    65         self.md5 = md5.string if md5 else "Unknown"
   154             self.md5 = "Unknown"
    66         self.date = date.string if date else "Unknown"
   155     def format(self, prefix=""):
    67     def format(self, prefix=""):
   156         res = prefix + '<download type="' + self.dltype + '" id="' + self.id + '">\n'
    68         res = prefix + '<download type="' + self.dltype + '" id="' + self.id + '">\n'
   157         res += prefix + "  <web>" + self.web + "</web>\n"
    69         res += prefix + "  <web>" + self.web + "</web>\n"
   158         res += prefix + "  <torrent>" + self.torrent + "</torrent>\n"
    70         res += prefix + "  <torrent>" + self.torrent + "</torrent>\n"
   159         res += prefix + "  <size>" + self.size + "</size>\n"
    71         res += prefix + "  <size>" + self.size + "</size>\n"
   160         res += prefix + "  <md5>" + self.md5 + "</md5>\n"
    72         res += prefix + "  <md5>" + self.md5 + "</md5>\n"
       
    73         res += prefix + "  <date>" + self.date + "</date>\n"
   161         res += prefix + "</download>"
    74         res += prefix + "</download>"
   162         return res
    75         return res
   163     def __repr__(self):
    76     def __repr__(self):
   164         return self.format()
    77         return self.format()
   165 
    78 
   166 class Downloads:
    79 class Downloads:
   167     def __init__(self, dom):
    80     def __init__(self, soup):
   168         self.id = dom.name[len("downloads "):].split(" ")[0]
    81         self.id = [class_ for class_ in soup["class"] if class_ != "downloads"][0]
   169         self.elements = []
    82         self.elements = []
   170         self.others = []
    83         self.others = []
   171         self.addchilds(dom)
    84         self.addchilds(soup)
   172     def addchilds(self, dom):
    85     def addchilds(self, soup):
   173         for child in dom.childs:
    86         for child in soup.children:
   174             if child.name.startswith("downloads"):
    87             if type(child) is not bs4.element.Tag:
       
    88                 continue
       
    89             classes = child["class"] if "class" in child.attrs else []
       
    90             if [True for attr in classes if attr in ("arc-toggle", "downloads")]:
   175                 self.addchilds(child)
    91                 self.addchilds(child)
   176             elif child.name.startswith("download"):
    92             elif "download" in classes:
   177                 self.elements.append(Download(self.id, child))
    93                 self.elements.append(Download(self.id, child))
   178             elif child.name == "arc-toggle":
    94             elif [True for attr in classes if attr in ("clearfix","label")]:
   179                 self.addchilds(child)
       
   180             elif child.name in ("clearfix","label"):
       
   181                 pass
    95                 pass
   182             else:
    96             else:
   183                 self.others.append(child)
    97                 self.others.append(child)
   184     def __iter__(self):
    98     def __iter__(self):
   185         return iter(self.elements)
    99         return iter(self.elements)
   197         return res
   111         return res
   198     def __repr__(self):
   112     def __repr__(self):
   199         return self.format()
   113         return self.format()
   200 
   114 
   201 class Game:
   115 class Game:
   202     def __init__(self, dom):
   116     def __init__(self, soup):
   203         self.title = "unknown"
   117         self.title = "unknown"
   204         self.downloads = []
   118         self.downloads = []
   205         self.others = []
   119         self.others = []
   206         for child in dom.childs:
   120         for child in soup.children:            
   207             if child.name == "gameinfo":
   121             if type(child) is not bs4.element.Tag:
   208                 self.title = dom.gameinfo.title.a.data.data.strip()
   122                 continue
   209             elif child.name.startswith("downloads "):
   123             classes = child["class"] if "class" in child.attrs else []
       
   124             if "gameinfo" in classes:
       
   125                 self.title = child.find(class_="title").a.string.strip()
       
   126             elif "downloads" in classes:
   210                 self.downloads.append(Downloads(child))
   127                 self.downloads.append(Downloads(child))
   211             elif child.name in ["icn", "clearfix"]:
   128             elif [True for attr in classes if attr in ["icn", "clearfix"]]:
   212                 pass
   129                 pass
   213             else:
   130             else:
   214                 self.others.append(child)
   131                 self.others.append(child)
   215     def __repr__(self):
   132     def __repr__(self):
   216         res  = "<game>\n"
   133         res  = "<game>\n"
   226                 res += o.format("    ") + "\n"
   143                 res += o.format("    ") + "\n"
   227             res += "  </others>\n"
   144             res += "  </others>\n"
   228         res += "</game>"
   145         res += "</game>"
   229         return res
   146         return res
   230 
   147 
   231 def parseGamesFromDom(dom):
   148 def parseGamesFromSoup(soup):
   232     for row in findRows(dom):
   149     for row in soup.find_all(class_="row"):
   233         yield Game(row)
   150         yield Game(row)
   234 
   151 
   235 def parseGamesFromFile(filename):
   152 def parseGamesFromFile(filename):
   236     parser = BundleParser()
   153     for game in parseGamesFromSoup(bs4.BeautifulSoup(open(filename))):
   237     for l in open(filename):
       
   238         parser.feed(l)
       
   239     for game in parseGamesFromDom(parser.dom):
       
   240         yield game
   154         yield game
   241 
   155 
   242 class FileSelector:
   156 class FileSelector:
   243     def scoreDownload(self, dl):
   157     def scoreDownload(self, dl):
   244         if dl.dltype == "audio":
   158         if dl.dltype == "audio":
   297     for game in parseGamesFromFile(fn):
   211     for game in parseGamesFromFile(fn):
   298         for dls in game.downloads:
   212         for dls in game.downloads:
   299             scores = list(selector(dls))
   213             scores = list(selector(dls))
   300             choosen = selectHighestScore(scores)
   214             choosen = selectHighestScore(scores)
   301             for score, dl in scores:
   215             for score, dl in scores:
   302                 print("[%s] %2d | %-20s | %-10s | %-25s | %s " % (
   216                 print("[%s] %2d | %-20s | %-15s | %-10s | %-25s | %s " % (
   303                         "*" if dl in choosen else " ",
   217                         "*" if dl in choosen else " ",
   304                         score,
   218                         score,
   305                         game.title,
   219                         game.title,
   306                         dls.id,
   220                         dls.id,
       
   221                         dls.date,
   307                         ", ".join(sorted(dl.attrs)),
   222                         ", ".join(sorted(dl.attrs)),
   308                         dl.torrent))
   223                         dl.torrent))
   309                 if dl in choosen:
   224                 if dl in choosen:
   310                     downloads.append(dl)
   225                     downloads.append(dl)
   311             if not scores:
   226             if not scores: