diff -r 98065a298da0 -r e3a2bb2bae8d update-hib.py --- a/update-hib.py Sun Mar 17 22:09:01 2013 -0400 +++ b/update-hib.py Sun Sep 22 22:07:04 2013 -0400 @@ -16,101 +16,12 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . - -from html.parser import HTMLParser +import bs4 from pprint import pprint -import xml.dom from itertools import chain, groupby import logging import operator -class Node: - def __init__(self, **args): - self.childs = [] - self.attrs = {} - for arg in args: - setattr(self, arg, args[arg]) - if self.name == "div" and "class" in self.attrs: - self.tag = self.name - self.name = self.attrs["class"] - del self.attrs["class"] - else: - self.tag = self.name - def format(self, prefix = ""): - res = prefix + "<" + self.name - for attr in self.attrs: - if self.attrs[attr]: - res += "\n" + prefix + " " + attr + '="' + self.attrs[attr] + '"' - else: - res += "\n" + prefix + " " + attr - if self.name == "data": - res += ">" + self.data + "" - elif self.childs: - res += ">" - for child in self.childs: - res += "\n" + child.format(prefix + " ") - res += "\n" + prefix + "" - else: - res += "/>" - return res - def find(self, prefix): - for child in self.childs: - if child.name.startswith(prefix): - yield child - def __getattr__(self, name): - for child in self.childs: - if child.name == name: - setattr(self, name, child) - return child - raise AttributeError(name) - def __repr__(self): - return self.format() - -class BundleParser(HTMLParser): - def __init__(self, **args): - super(BundleParser, self).__init__(**args) - self.dom = Node(name = "root", - childs = [], - parent = None) - self.current = self.dom - self.depth = 1 - def handle_starttag(self, tag, attrs): - # print("+" * self.depth,tag) - new = Node(name = tag, - attrs = dict(attrs), - childs = [], - parent = self.current) - self.current.childs.append(new) - self.current = new - self.depth += 1 - def handle_endtag(self, tag): - while tag != self.current.tag: - print("*** Skipping", self.current.tag,"; looking for",tag) - self.current = self.current.parent - self.depth-=1 - # print("-" * self.depth,self.current.tag) - assert(self.current != self.dom) - assert(self.current.tag == tag) - self.depth-=1 - # print("-" * self.depth,tag) - self.current = self.current.parent - def handle_data(self, data): - if data.strip(): - self.current.childs.append(Node(name = "data", data = data, childs = [])) - -def findRows(dom): - for child in dom.childs: - try: - if child.name[:4] == "row ": - yield child - else: - for row in findRows(child): - yield row - except KeyError: - pass - for row in findRows(child): - yield row - class Download: subst = { "arc32" : ("x86",), "arc64" : ("x64",), @@ -130,54 +41,57 @@ "(HD)" : ("HD",), "(MP3)" : ("MP3",), } - def __init__(self, dltype, dom): + def __init__(self, dltype, soup): self.dltype = dltype - self.id = dom.name[len("download "):] - button = list(dom.find("flexbtn "))[0] - desc = button.span.data.data - self.id += " " + desc + ids = [attr for attr in soup["class"] if attr != "download"] + button = soup.find(class_="flexbtn") + desc = button.span.string + ids.extend(desc.split(" ")) + self.id = " ".join(ids) def cleanup(attr): attr = attr.strip() if attr not in ("Download","small",""): for s in self.subst.get(attr,(attr,)): yield s - self.attrs = set(chain.from_iterable(cleanup(attr) for attr in self.id.split(" "))) + self.attrs = set(chain.from_iterable(cleanup(attr) for attr in ids)) urls = button.a.attrs self.torrent = urls["data-bt"] self.web = urls["data-web"] - details = dom.dldetails.dlsize - if details.childs: - self.size = details.span.data.data - self.md5 = details.a.attrs["href"] - else: - self.size = "Unknown" - self.md5 = "Unknown" + details = soup.find(class_="dldetails").find(class_="dlsize") + size = details.find(class_="mbs") + md5 = details.find(class_="dlmd5") + date = details.find(class_="dldate") + self.size = size.string if size else "Unknown" + self.md5 = md5.string if md5 else "Unknown" + self.date = date.string if date else "Unknown" def format(self, prefix=""): res = prefix + '\n' res += prefix + " " + self.web + "\n" res += prefix + " " + self.torrent + "\n" res += prefix + " " + self.size + "\n" res += prefix + " " + self.md5 + "\n" + res += prefix + " " + self.date + "\n" res += prefix + "" return res def __repr__(self): return self.format() class Downloads: - def __init__(self, dom): - self.id = dom.name[len("downloads "):].split(" ")[0] + def __init__(self, soup): + self.id = [class_ for class_ in soup["class"] if class_ != "downloads"][0] self.elements = [] self.others = [] - self.addchilds(dom) - def addchilds(self, dom): - for child in dom.childs: - if child.name.startswith("downloads"): + self.addchilds(soup) + def addchilds(self, soup): + for child in soup.children: + if type(child) is not bs4.element.Tag: + continue + classes = child["class"] if "class" in child.attrs else [] + if [True for attr in classes if attr in ("arc-toggle", "downloads")]: self.addchilds(child) - elif child.name.startswith("download"): + elif "download" in classes: self.elements.append(Download(self.id, child)) - elif child.name == "arc-toggle": - self.addchilds(child) - elif child.name in ("clearfix","label"): + elif [True for attr in classes if attr in ("clearfix","label")]: pass else: self.others.append(child) @@ -199,16 +113,19 @@ return self.format() class Game: - def __init__(self, dom): + def __init__(self, soup): self.title = "unknown" self.downloads = [] self.others = [] - for child in dom.childs: - if child.name == "gameinfo": - self.title = dom.gameinfo.title.a.data.data.strip() - elif child.name.startswith("downloads "): + for child in soup.children: + if type(child) is not bs4.element.Tag: + continue + classes = child["class"] if "class" in child.attrs else [] + if "gameinfo" in classes: + self.title = child.find(class_="title").a.string.strip() + elif "downloads" in classes: self.downloads.append(Downloads(child)) - elif child.name in ["icn", "clearfix"]: + elif [True for attr in classes if attr in ["icn", "clearfix"]]: pass else: self.others.append(child) @@ -228,15 +145,12 @@ res += "" return res -def parseGamesFromDom(dom): - for row in findRows(dom): +def parseGamesFromSoup(soup): + for row in soup.find_all(class_="row"): yield Game(row) def parseGamesFromFile(filename): - parser = BundleParser() - for l in open(filename): - parser.feed(l) - for game in parseGamesFromDom(parser.dom): + for game in parseGamesFromSoup(bs4.BeautifulSoup(open(filename))): yield game class FileSelector: @@ -299,11 +213,12 @@ scores = list(selector(dls)) choosen = selectHighestScore(scores) for score, dl in scores: - print("[%s] %2d | %-20s | %-10s | %-25s | %s " % ( + print("[%s] %2d | %-20s | %-15s | %-10s | %-25s | %s " % ( "*" if dl in choosen else " ", score, game.title, dls.id, + dls.date, ", ".join(sorted(dl.attrs)), dl.torrent)) if dl in choosen: