diff -r 98065a298da0 -r e3a2bb2bae8d update-hib.py
--- a/update-hib.py Sun Mar 17 22:09:01 2013 -0400
+++ b/update-hib.py Sun Sep 22 22:07:04 2013 -0400
@@ -16,101 +16,12 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-from html.parser import HTMLParser
+import bs4
from pprint import pprint
-import xml.dom
from itertools import chain, groupby
import logging
import operator
-class Node:
- def __init__(self, **args):
- self.childs = []
- self.attrs = {}
- for arg in args:
- setattr(self, arg, args[arg])
- if self.name == "div" and "class" in self.attrs:
- self.tag = self.name
- self.name = self.attrs["class"]
- del self.attrs["class"]
- else:
- self.tag = self.name
- def format(self, prefix = ""):
- res = prefix + "<" + self.name
- for attr in self.attrs:
- if self.attrs[attr]:
- res += "\n" + prefix + " " + attr + '="' + self.attrs[attr] + '"'
- else:
- res += "\n" + prefix + " " + attr
- if self.name == "data":
- res += ">" + self.data + "" + self.name + ">"
- elif self.childs:
- res += ">"
- for child in self.childs:
- res += "\n" + child.format(prefix + " ")
- res += "\n" + prefix + "" + self.name + ">"
- else:
- res += "/>"
- return res
- def find(self, prefix):
- for child in self.childs:
- if child.name.startswith(prefix):
- yield child
- def __getattr__(self, name):
- for child in self.childs:
- if child.name == name:
- setattr(self, name, child)
- return child
- raise AttributeError(name)
- def __repr__(self):
- return self.format()
-
-class BundleParser(HTMLParser):
- def __init__(self, **args):
- super(BundleParser, self).__init__(**args)
- self.dom = Node(name = "root",
- childs = [],
- parent = None)
- self.current = self.dom
- self.depth = 1
- def handle_starttag(self, tag, attrs):
- # print("+" * self.depth,tag)
- new = Node(name = tag,
- attrs = dict(attrs),
- childs = [],
- parent = self.current)
- self.current.childs.append(new)
- self.current = new
- self.depth += 1
- def handle_endtag(self, tag):
- while tag != self.current.tag:
- print("*** Skipping", self.current.tag,"; looking for",tag)
- self.current = self.current.parent
- self.depth-=1
- # print("-" * self.depth,self.current.tag)
- assert(self.current != self.dom)
- assert(self.current.tag == tag)
- self.depth-=1
- # print("-" * self.depth,tag)
- self.current = self.current.parent
- def handle_data(self, data):
- if data.strip():
- self.current.childs.append(Node(name = "data", data = data, childs = []))
-
-def findRows(dom):
- for child in dom.childs:
- try:
- if child.name[:4] == "row ":
- yield child
- else:
- for row in findRows(child):
- yield row
- except KeyError:
- pass
- for row in findRows(child):
- yield row
-
class Download:
subst = { "arc32" : ("x86",),
"arc64" : ("x64",),
@@ -130,54 +41,57 @@
"(HD)" : ("HD",),
"(MP3)" : ("MP3",),
}
- def __init__(self, dltype, dom):
+ def __init__(self, dltype, soup):
self.dltype = dltype
- self.id = dom.name[len("download "):]
- button = list(dom.find("flexbtn "))[0]
- desc = button.span.data.data
- self.id += " " + desc
+ ids = [attr for attr in soup["class"] if attr != "download"]
+ button = soup.find(class_="flexbtn")
+ desc = button.span.string
+ ids.extend(desc.split(" "))
+ self.id = " ".join(ids)
def cleanup(attr):
attr = attr.strip()
if attr not in ("Download","small",""):
for s in self.subst.get(attr,(attr,)):
yield s
- self.attrs = set(chain.from_iterable(cleanup(attr) for attr in self.id.split(" ")))
+ self.attrs = set(chain.from_iterable(cleanup(attr) for attr in ids))
urls = button.a.attrs
self.torrent = urls["data-bt"]
self.web = urls["data-web"]
- details = dom.dldetails.dlsize
- if details.childs:
- self.size = details.span.data.data
- self.md5 = details.a.attrs["href"]
- else:
- self.size = "Unknown"
- self.md5 = "Unknown"
+ details = soup.find(class_="dldetails").find(class_="dlsize")
+ size = details.find(class_="mbs")
+ md5 = details.find(class_="dlmd5")
+ date = details.find(class_="dldate")
+ self.size = size.string if size else "Unknown"
+ self.md5 = md5.string if md5 else "Unknown"
+ self.date = date.string if date else "Unknown"
def format(self, prefix=""):
res = prefix + '\n'
res += prefix + " " + self.web + "\n"
res += prefix + " " + self.torrent + "\n"
res += prefix + " " + self.size + "\n"
res += prefix + " " + self.md5 + "\n"
+ res += prefix + " " + self.date + "\n"
res += prefix + ""
return res
def __repr__(self):
return self.format()
class Downloads:
- def __init__(self, dom):
- self.id = dom.name[len("downloads "):].split(" ")[0]
+ def __init__(self, soup):
+ self.id = [class_ for class_ in soup["class"] if class_ != "downloads"][0]
self.elements = []
self.others = []
- self.addchilds(dom)
- def addchilds(self, dom):
- for child in dom.childs:
- if child.name.startswith("downloads"):
+ self.addchilds(soup)
+ def addchilds(self, soup):
+ for child in soup.children:
+ if type(child) is not bs4.element.Tag:
+ continue
+ classes = child["class"] if "class" in child.attrs else []
+ if [True for attr in classes if attr in ("arc-toggle", "downloads")]:
self.addchilds(child)
- elif child.name.startswith("download"):
+ elif "download" in classes:
self.elements.append(Download(self.id, child))
- elif child.name == "arc-toggle":
- self.addchilds(child)
- elif child.name in ("clearfix","label"):
+ elif [True for attr in classes if attr in ("clearfix","label")]:
pass
else:
self.others.append(child)
@@ -199,16 +113,19 @@
return self.format()
class Game:
- def __init__(self, dom):
+ def __init__(self, soup):
self.title = "unknown"
self.downloads = []
self.others = []
- for child in dom.childs:
- if child.name == "gameinfo":
- self.title = dom.gameinfo.title.a.data.data.strip()
- elif child.name.startswith("downloads "):
+ for child in soup.children:
+ if type(child) is not bs4.element.Tag:
+ continue
+ classes = child["class"] if "class" in child.attrs else []
+ if "gameinfo" in classes:
+ self.title = child.find(class_="title").a.string.strip()
+ elif "downloads" in classes:
self.downloads.append(Downloads(child))
- elif child.name in ["icn", "clearfix"]:
+ elif [True for attr in classes if attr in ["icn", "clearfix"]]:
pass
else:
self.others.append(child)
@@ -228,15 +145,12 @@
res += ""
return res
-def parseGamesFromDom(dom):
- for row in findRows(dom):
+def parseGamesFromSoup(soup):
+ for row in soup.find_all(class_="row"):
yield Game(row)
def parseGamesFromFile(filename):
- parser = BundleParser()
- for l in open(filename):
- parser.feed(l)
- for game in parseGamesFromDom(parser.dom):
+ for game in parseGamesFromSoup(bs4.BeautifulSoup(open(filename))):
yield game
class FileSelector:
@@ -299,11 +213,12 @@
scores = list(selector(dls))
choosen = selectHighestScore(scores)
for score, dl in scores:
- print("[%s] %2d | %-20s | %-10s | %-25s | %s " % (
+ print("[%s] %2d | %-20s | %-15s | %-10s | %-25s | %s " % (
"*" if dl in choosen else " ",
score,
game.title,
dls.id,
+ dl.date,  # date is parsed per Download (dl); the Downloads group (dls) has no date attribute
", ".join(sorted(dl.attrs)),
dl.torrent))
if dl in choosen: