Initial version: parse the tidy file and select a suitable download URL.
#!/usr/bin/python3
from html.parser import HTMLParser
from pprint import pprint
import xml.dom
from itertools import chain
class Node:
    """One element (or text chunk) of the tree built from the parsed HTML.

    Every keyword argument given to the constructor becomes an instance
    attribute.  ``name`` is required; ``attrs`` and ``childs`` default to an
    empty dict and an empty list.

    Attributes:
        tag: the name the node was created with.
        name: same as ``tag``, except that a ``div`` carrying a ``class``
            value is renamed to that class string.
        attrs: attribute dict; ``class`` is removed from it when it was used
            as the node name.
        childs: list of child nodes.

    Children are also reachable as attributes: ``node.span`` returns the
    first child whose ``name`` equals ``"span"``.
    """

    def __init__(self, **args):
        self.childs = []
        self.attrs = {}
        for key, value in args.items():
            setattr(self, key, value)
        # Remember the real tag before a div is possibly renamed below.
        self.tag = self.name
        # A valueless ``class`` attribute arrives as None; renaming the node
        # to None would break every later string operation on ``name``, so
        # such a div keeps its tag name.
        if self.tag == "div" and self.attrs.get("class") is not None:
            self.name = self.attrs.pop("class")

    def format(self, prefix=""):
        """Return the node and its subtree as indented, XML-like text.

        Args:
            prefix: string put in front of every line of this node; children
                are formatted with one more space.
        """
        parts = [prefix + "<" + self.name]
        for attr, value in self.attrs.items():
            line = "\n" + prefix + " " + attr
            if value:
                line += '="' + value + '"'
            parts.append(line)
        # Only a node that really carries its own text is a text node; an
        # element that merely happens to be named "data" (for instance
        # <div class="data">) is formatted like any other element.
        text = self.__dict__.get("data")
        if self.name == "data" and isinstance(text, str):
            parts.append(">" + text + "</" + self.name + ">")
        elif self.childs:
            parts.append(">")
            for child in self.childs:
                parts.append("\n" + child.format(prefix + " "))
            parts.append("\n" + prefix + "</" + self.name + ">")
        else:
            parts.append("/>")
        return "".join(parts)

    def find(self, prefix):
        """Yield the direct children whose name starts with *prefix*."""
        for child in self.childs:
            if child.name.startswith(prefix):
                yield child

    def __getattr__(self, name):
        """Resolve an unknown attribute to the first child with that name.

        The child found is stored as a regular attribute, so later lookups do
        not come through here again.

        Raises:
            AttributeError: if no direct child is called *name*.
        """
        # Read the child list straight from the instance dict: going through
        # ``self.childs`` would re-enter __getattr__ forever on an instance
        # that has not been initialised yet (copy and pickle probe such
        # instances for optional methods).
        for child in self.__dict__.get("childs", ()):
            if child.name == name:
                setattr(self, name, child)
                return child
        raise AttributeError(name)

    def __repr__(self):
        return self.format()
class BundleParser(HTMLParser):
    """HTMLParser that records the document as a tree of Node objects.

    Attributes:
        dom: synthetic root node (named "root") that owns the whole tree.
        current: the innermost element that is still open.
        depth: nesting level of ``current``; the root is at level 1.
    """

    def __init__(self, **args):
        super().__init__(**args)
        self.dom = Node(name="root", childs=[], parent=None)
        self.current = self.dom
        self.depth = 1

    def handle_starttag(self, tag, attrs):
        """Append a new element under the open one and descend into it."""
        new = Node(name=tag,
                   attrs=dict(attrs),
                   childs=[],
                   parent=self.current)
        self.current.childs.append(new)
        self.current = new
        self.depth += 1

    def handle_endtag(self, tag):
        """Close the nearest open element whose tag is *tag*.

        Elements opened after it that were never closed are closed implicitly
        and reported on stdout.  An end tag that matches no open element is
        reported and ignored, leaving the parser state untouched.
        """
        # Look for the matching ancestor before changing any state, so that a
        # stray end tag cannot unwind the stack up to (or past) the root.
        target = self.current
        while target is not None and target is not self.dom and target.tag != tag:
            target = target.parent
        if target is None or target is self.dom:
            print("*** Ignoring unmatched end tag", tag)
            return
        while self.current is not target:
            print("*** Skipping", self.current.tag,"; looking for",tag)
            self.current = self.current.parent
            self.depth -= 1
        self.depth -= 1
        self.current = self.current.parent

    def handle_data(self, data):
        """Store text that is not pure whitespace as a "data" child node."""
        if data.strip():
            self.current.childs.append(Node(name="data", data=data, childs=[]))
def findRows(dom):
    """Yield, in document order, every node below *dom* named "row ...".

    A node counts as a row when its ``name`` starts with ``"row "``.  Rows are
    yielded exactly once and are not searched for nested rows; every other
    node is searched recursively through its ``childs`` list.

    Args:
        dom: node whose descendants are searched; *dom* itself is never
            yielded.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # that they are popped in their original order.
    pending = list(reversed(dom.childs))
    while pending:
        node = pending.pop()
        if node.name.startswith("row "):
            yield node
        else:
            pending.extend(reversed(node.childs))
class Download:
    """A single downloadable file, built from a "download ..." node.

    Attributes:
        dltype: platform identifier handed in by the caller.
        id: remainder of the node name after "download ", followed by the
            text of the download button.
        attrs: set of normalised tokens derived from ``id``.
        torrent: value of the button link's ``data-bt`` attribute.
        web: value of the button link's ``data-web`` attribute.
        size: text found in the size details, or "Unknown".
        md5: ``href`` of the link in the size details, or "Unknown".
    """

    # Maps a raw token of the id to the normalised token(s) replacing it.
    subst = {
        "arc32": ("x86",),
        "arc64": ("x64",),
        "i386.deb": ("x86", "deb"),
        "x86_64.deb": ("x64", "deb"),
        "i686.rpm": ("x86", "rpm"),
        ".i386.rpm": ("x86", "rpm"),
        "x86_64.rpm": ("x64", "rpm"),
        ".x86_64.rpm": ("x64", "rpm"),
        "i386.tar.gz": ("x86", "tgz"),
        "x86_64.tar.gz": ("x64", "tgz"),
        ".tar.gz": ("tgz",),
        ".deb": ("deb",),
        ".rpm": ("rpm",),
        "32-bit": ("x86",),
        "64-bit": ("x64",),
    }

    def __init__(self, dltype, dom):
        """Extract id, tokens, URLs and size details from *dom*.

        Args:
            dltype: platform identifier of the enclosing section.
            dom: node named "download ..." that has a "flexbtn ..." child and
                a ``dldetails`` child containing ``dlsize``.
        """
        self.dltype = dltype
        base_id = dom.name[len("download "):]
        button = list(dom.find("flexbtn "))[0]
        self.id = base_id + " " + button.span.data.data

        # Tokenise the id, drop the filler words and normalise the rest.
        tokens = set()
        for raw in self.id.split(" "):
            word = raw.strip()
            if word in ("Download", "small", ""):
                continue
            tokens.update(self.subst.get(word, (word,)))
        self.attrs = tokens

        links = button.a.attrs
        self.torrent = links["data-bt"]
        self.web = links["data-web"]

        size_node = dom.dldetails.dlsize
        if not size_node.childs:
            self.size = "Unknown"
            self.md5 = "Unknown"
        else:
            self.size = size_node.span.data.data
            self.md5 = size_node.a.attrs["href"]

    def score(self):
        """Return how desirable this download is; higher is better.

        Raises:
            Exception: for an audio download with no known format token, or
                for a ``dltype`` that is not handled here.
        """
        kind = self.dltype
        if kind == "audio":
            for marker, value in (("FLAC", 2), ("MP3", 1), ("website", -1)):
                if marker in self.attrs:
                    return value
            raise Exception("Unknow audio type: %r" % (self.attrs))
        if kind == "mac" or kind == "windows":
            return -1
        if kind == "linux":
            # One point as a base, plus one each for 64-bit and .deb.
            return 1 + ("x64" in self.attrs) + ("deb" in self.attrs)
        if kind == "android":
            return 0
        raise Exception("Unknown dls type: %r" % (self,))

    def format(self, prefix=""):
        """Return an XML-like description, each line starting with *prefix*."""
        inner = prefix + " "
        lines = [
            prefix + '<download id="' + self.id + '">',
            inner + "<web>" + self.web + "</web>",
            inner + "<torrent>" + self.torrent + "</torrent>",
            inner + "<size>" + self.size + "</size>",
            inner + "<md5>" + self.md5 + "</md5>",
            prefix + "</download>",
        ]
        return "\n".join(lines)

    def __repr__(self):
        return self.format()
class Downloads:
    """All downloads of one platform section ("downloads <id> ..." node).

    Attributes:
        id: first word of the node name after "downloads ".
        elements: Download objects found in the section.
        others: child nodes that were not recognised.
    """

    def __init__(self, dom):
        self.id = dom.name[len("downloads "):].split(" ")[0]
        self.elements = []
        self.others = []
        self.addchilds(dom)

    def addchilds(self, dom):
        """Sort the children of *dom* into ``elements`` and ``others``.

        Nested "downloads..." and "arc-toggle" nodes are searched
        recursively; "clearfix" and "label" nodes are dropped.
        """
        for child in dom.childs:
            name = child.name
            # "downloads" must be tested before "download", its prefix.
            if name.startswith("downloads") or name == "arc-toggle":
                self.addchilds(child)
            elif name.startswith("download"):
                self.elements.append(Download(self.id, child))
            elif name not in ("clearfix", "label"):
                self.others.append(child)

    def __iter__(self):
        return iter(self.elements)

    def format(self, prefix=""):
        """Return an XML-like description, each line starting with *prefix*."""
        inner = prefix + " "
        lines = [prefix + '<downloads id="' + self.id + '">']
        lines.extend(el.format(inner) for el in self.elements)
        if self.others:
            lines.append(inner + "<others>")
            lines.extend(o.format(inner) for o in self.others)
            lines.append(inner + "</others>")
        lines.append(prefix + "</downloads>")
        return "\n".join(lines)

    def choose(self):
        """Return a list holding the best download, or an empty list.

        Downloads with a negative score are never chosen.  Among equal top
        scores the one that comes first in ``elements`` wins.
        """
        # Score every download once; max() returns the first maximal item,
        # which keeps the tie-breaking of a stable descending sort.
        scored = [(dl.score(), dl) for dl in self]
        usable = [pair for pair in scored if pair[0] >= 0]
        if not usable:
            return []
        return [max(usable, key=lambda pair: pair[0])[1]]

    def __repr__(self):
        return self.format()
class Game:
    """One row of the page: a title plus its platform download sections.

    Attributes:
        title: stripped text of the title link inside ``gameinfo``, or
            "unknown" when the row has no ``gameinfo`` child.
        downloads: one Downloads object per "downloads ..." child.
        others: children that were not recognised.
    """

    def __init__(self, dom):
        self.title = "unknown"
        self.downloads = []
        self.others = []
        ignored = ["icn", "clearfix"]
        for node in dom.childs:
            if node.name == "gameinfo":
                self.title = dom.gameinfo.title.a.data.data.strip()
                continue
            if node.name.startswith("downloads "):
                self.downloads.append(Downloads(node))
                continue
            if node.name not in ignored:
                self.others.append(node)

    def __repr__(self):
        """Return an XML-like dump of the title, downloads and leftovers."""
        lines = ["<game>", " <title>" + self.title + "</title>"]
        if self.downloads:
            lines.append(" <downloads>")
            lines.extend(section.format(" ") for section in self.downloads)
            lines.append(" </downloads>")
        if self.others:
            lines.append(" <others>")
            lines.extend(extra.format(" ") for extra in self.others)
            lines.append(" </others>")
        lines.append("</game>")
        return "\n".join(lines)
def parseGames(dom):
    """Return a generator producing one Game per row found below *dom*."""
    return (Game(row) for row in findRows(dom))
# Parse the saved page, then print one table line per download.  The download
# chosen for each platform section is marked with "*".
parser = BundleParser()
with open("tidy_bundle.html") as f:
    for l in f:
        parser.feed(l)
# feed() may keep unfinished input buffered; close() forces the parser to
# process whatever is left as if end-of-file had been reached.
parser.close()
for game in parseGames(parser.dom):
    for dls in game.downloads:
        choosen = dls.choose()
        for dl in dls:
            print("%s | %-20s | %-10s | %-25s | %s " % (
                "*" if dl in choosen else " ",
                game.title,
                dls.id,
                ", ".join(sorted(dl.attrs)),
                dl.torrent))