Take filename from command line arguments, add copyright and better readme text.
#!/usr/bin/python3
#
# Update HIB - Scraper for the HumbleBundle library page.
# Copyright (C) 2012, Fabien Ninoles <- fabien - AT - tzone . org ->
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from html.parser import HTMLParser
from pprint import pprint
import xml.dom
from itertools import chain
class Node:
    """Minimal DOM node with attribute-style access to named children.

    Every keyword argument passed to the constructor becomes an instance
    attribute (typically ``name``, ``attrs``, ``childs``, ``parent`` and,
    for text nodes, ``data``).  ``tag`` always keeps the original ``name``.
    A ``div`` carrying a non-empty ``class`` attribute is renamed after that
    class value, and the ``class`` entry is removed from ``attrs``.
    """

    def __init__(self, **args):
        """Store *args* as attributes, then derive ``tag`` and ``name``.

        Raises:
            AttributeError: if no ``name`` keyword was supplied.
        """
        self.childs = []
        self.attrs = {}
        for key, value in args.items():
            setattr(self, key, value)
        self.tag = self.name
        # Only rename when the class value is a non-empty string: a
        # valueless ``<div class>`` yields None, which would otherwise become
        # the node name and break every later string operation on it.
        if self.tag == "div" and self.attrs.get("class"):
            self.name = self.attrs.pop("class")

    def format(self, prefix = ""):
        """Return an XML-like, indented rendering of this node and its children.

        Args:
            prefix: string prepended to every line produced for this node.
        """
        parts = [prefix, "<", self.name]
        for attr, value in self.attrs.items():
            parts.extend(("\n", prefix, " ", attr))
            if value:
                parts.extend(('="', value, '"'))
        if self.name == "data":
            # Text nodes print their payload inline.
            parts.extend((">", self.data, "</", self.name, ">"))
        elif self.childs:
            parts.append(">")
            for child in self.childs:
                parts.extend(("\n", child.format(prefix + " ")))
            parts.extend(("\n", prefix, "</", self.name, ">"))
        else:
            parts.append("/>")
        return "".join(parts)

    def find(self, prefix):
        """Yield the direct children whose name starts with *prefix*."""
        for child in self.childs:
            if child.name.startswith(prefix):
                yield child

    def __getattr__(self, name):
        """Resolve an unknown attribute to the first direct child named *name*.

        The child found is cached as an instance attribute so later lookups
        bypass this method.

        Raises:
            AttributeError: if no direct child has that name.
        """
        # Go through __dict__ so that a lookup on an instance that has no
        # ``childs`` yet (bare __new__, copy, pickle) raises AttributeError
        # instead of recursing into __getattr__ forever.
        for child in self.__dict__.get("childs", ()):
            if child.name == name:
                setattr(self, name, child)
                return child
        raise AttributeError(name)

    def __repr__(self):
        """Return the same text as ``format()`` with an empty prefix."""
        return self.format()
class BundleParser(HTMLParser):
    """HTMLParser that builds a tree of Node objects under ``self.dom``.

    ``self.current`` is the node new children are attached to and
    ``self.depth`` tracks how deep ``current`` sits below the root.
    """

    def __init__(self, **args):
        """Create the parser with an empty ``root`` node; *args* go to HTMLParser."""
        super().__init__(**args)
        self.dom = Node(name = "root",
                        childs = [],
                        parent = None)
        self.current = self.dom
        self.depth = 1

    def handle_starttag(self, tag, attrs):
        """Append a node for *tag* to the current node and descend into it."""
        new = Node(name = tag,
                   attrs = dict(attrs),
                   childs = [],
                   parent = self.current)
        self.current.childs.append(new)
        self.current = new
        self.depth += 1

    def handle_endtag(self, tag):
        """Close the nearest open element whose tag is *tag*.

        Open elements found on the way up (tags that were never closed) are
        abandoned with a ``*** Skipping`` message.  A closing tag that matches
        no open element is ignored with a message instead of walking past the
        root; the previous ``assert`` guards vanished under ``python -O`` and
        did not cover a stray closing tag seen at root level.
        """
        # Look for the matching open element before touching any state.
        match = self.current
        while match is not None and match is not self.dom and match.tag != tag:
            match = match.parent
        if match is None or match is self.dom:
            print("*** Ignoring unmatched closing tag", tag)
            return
        while self.current is not match:
            print("*** Skipping", self.current.tag,"; looking for",tag)
            self.current = self.current.parent
            self.depth -= 1
        self.depth -= 1
        self.current = self.current.parent

    def handle_data(self, data):
        """Attach non-blank text to the current node as a ``data`` node."""
        if data.strip():
            self.current.childs.append(Node(name = "data", data = data, childs = []))
def findRows(dom):
    """Yield, in document order, the outermost nodes whose name starts with "row ".

    The tree under *dom* is walked depth-first through ``childs``.  A node
    that matches is yielded and not searched any further; any other node is
    searched recursively.

    The previous ``except KeyError`` fallback was unreachable (nothing in the
    guarded code can raise ``KeyError``) and only duplicated the recursion,
    so it has been removed.
    """
    for child in dom.childs:
        if child.name.startswith("row "):
            yield child
        else:
            yield from findRows(child)
class Download:
    """A single downloadable file described by a ``download ...`` node.

    Attributes set by the constructor: ``dltype``, ``id``, ``attrs`` (a set
    of normalized keywords), ``torrent``, ``web``, ``size`` and ``md5``.
    """

    # Maps raw words found in the download id to normalized keywords.
    subst = {
        "arc32": ("x86",),
        "arc64": ("x64",),
        "i386.deb": ("x86", "deb"),
        "x86_64.deb": ("x64", "deb"),
        "i686.rpm": ("x86", "rpm"),
        ".i386.rpm": ("x86", "rpm"),
        "x86_64.rpm": ("x64", "rpm"),
        ".x86_64.rpm": ("x64", "rpm"),
        "i386.tar.gz": ("x86", "tgz"),
        "x86_64.tar.gz": ("x64", "tgz"),
        ".tar.gz": ("tgz",),
        ".deb": ("deb",),
        ".rpm": ("rpm",),
        "32-bit": ("x86",),
        "64-bit": ("x64",),
    }

    def __init__(self, dltype, dom):
        """Extract id, keywords, links, size and checksum link from *dom*.

        Args:
            dltype: category label stored as ``self.dltype``.
            dom: node whose name starts with ``"download "`` and which has a
                child whose name starts with ``"flexbtn "``.
        """
        self.dltype = dltype
        self.id = dom.name[len("download "):]
        buttons = list(dom.find("flexbtn "))
        button = buttons[0]
        label = button.span.data.data
        self.id += " " + label

        # Normalize every word of the id, dropping noise words and blanks.
        keywords = set()
        for raw in self.id.split(" "):
            word = raw.strip()
            if word in ("Download", "small", ""):
                continue
            keywords.update(self.subst.get(word, (word,)))
        self.attrs = keywords

        links = button.a.attrs
        self.torrent = links["data-bt"]
        self.web = links["data-web"]

        size_node = dom.dldetails.dlsize
        if not size_node.childs:
            self.size = "Unknown"
            self.md5 = "Unknown"
        else:
            self.size = size_node.span.data.data
            self.md5 = size_node.a.attrs["href"]

    def format(self, prefix=""):
        """Return an XML-like rendering, each line starting with *prefix*."""
        lines = [
            prefix + '<download id="' + self.id + '">',
            prefix + " <web>" + self.web + "</web>",
            prefix + " <torrent>" + self.torrent + "</torrent>",
            prefix + " <size>" + self.size + "</size>",
            prefix + " <md5>" + self.md5 + "</md5>",
            prefix + "</download>",
        ]
        return "\n".join(lines)

    def __repr__(self):
        """Return the same text as ``format()`` with an empty prefix."""
        return self.format()
class Downloads:
    """Group of Download entries collected from a ``downloads ...`` node.

    ``id`` is the first word following ``"downloads "`` in the node name,
    ``elements`` holds the Download objects and ``others`` the child nodes
    that were not recognized.
    """

    def __init__(self, dom):
        """Collect downloads and unrecognized nodes from *dom*."""
        remainder = dom.name[len("downloads "):]
        self.id = remainder.split(" ")[0]
        self.elements = []
        self.others = []
        self.addchilds(dom)

    def addchilds(self, dom):
        """Sort the children of *dom* into ``elements`` and ``others``.

        Nested ``downloads...`` and ``arc-toggle`` nodes are descended into,
        ``download...`` nodes become Download objects, ``clearfix`` and
        ``label`` nodes are dropped, anything else is kept in ``others``.
        """
        for node in dom.childs:
            label = node.name
            if label.startswith("downloads") or label == "arc-toggle":
                self.addchilds(node)
            elif label.startswith("download"):
                self.elements.append(Download(self.id, node))
            elif label not in ("clearfix", "label"):
                self.others.append(node)

    def __iter__(self):
        """Iterate over the collected Download objects."""
        return iter(self.elements)

    def format(self, prefix = ""):
        """Return an XML-like rendering, each line starting with *prefix*."""
        inner = prefix + " "
        lines = [prefix + '<downloads id="' + self.id + '">']
        lines.extend(element.format(inner) for element in self.elements)
        if self.others:
            lines.append(prefix + " <others>")
            lines.extend(node.format(inner) for node in self.others)
            lines.append(prefix + " </others>")
        lines.append(prefix + "</downloads>")
        return "\n".join(lines)

    def __repr__(self):
        """Return the same text as ``format()`` with an empty prefix."""
        return self.format()
class Game:
    """A game row: its title, its download groups and any unrecognized nodes."""

    def __init__(self, dom):
        """Classify the direct children of the row node *dom*.

        A ``gameinfo`` child sets the title, children whose name starts with
        ``"downloads "`` become Downloads objects, ``icn`` and ``clearfix``
        children are dropped, everything else is kept in ``others``.
        """
        self.title = "unknown"
        self.downloads = []
        self.others = []
        for node in dom.childs:
            label = node.name
            if label == "gameinfo":
                link = dom.gameinfo.title.a
                self.title = link.data.data.strip()
                continue
            if label.startswith("downloads "):
                self.downloads.append(Downloads(node))
                continue
            if label not in ["icn", "clearfix"]:
                self.others.append(node)

    def __repr__(self):
        """Return an XML-like rendering of the game."""
        lines = ["<game>", " <title>" + self.title + "</title>"]
        if self.downloads:
            lines.append(" <downloads>")
            lines.extend(group.format(" ") for group in self.downloads)
            lines.append(" </downloads>")
        if self.others:
            lines.append(" <others>")
            lines.extend(node.format(" ") for node in self.others)
            lines.append(" </others>")
        lines.append("</game>")
        return "\n".join(lines)
def parseGamesFromDom(dom):
    """Lazily yield one Game for every row that findRows locates under *dom*."""
    yield from (Game(row) for row in findRows(dom))
def parseGamesFromFile(filename):
    """Parse the HTML file *filename* and yield a Game for each row found.

    The file is read line by line into a BundleParser.  It is now closed
    deterministically once parsing is done (it used to be left open), and
    ``parser.close()`` is called so that data still buffered by the parser at
    end of input is processed instead of being dropped.

    Args:
        filename: path of the saved page, opened in text mode with the
            default encoding.

    Yields:
        Game objects, in the order produced by parseGamesFromDom.
    """
    parser = BundleParser()
    with open(filename) as page:
        for line in page:
            parser.feed(line)
    parser.close()
    yield from parseGamesFromDom(parser.dom)
class FileSelector:
    """Ranks downloads by preference; higher scores are preferred."""

    def scoreDownload(self, dl):
        """Return the preference score of a single download.

        * ``mac``, ``windows`` and ``android`` downloads score -1.
        * ``linux`` downloads score 1, plus 1 for ``x64`` and 1 for ``deb``.
        * ``audio`` downloads score 2 for ``FLAC``, 1 for ``MP3`` and -1 for
          ``website``, checked in that order.

        Raises:
            Exception: for an audio download with none of the known keywords
                or for an unknown ``dltype``.
        """
        kind = dl.dltype
        if kind in ("mac", "windows", "android"):
            return -1
        if kind == "linux":
            bonus = sum(1 for keyword in ("x64", "deb") if keyword in dl.attrs)
            return 1 + bonus
        if kind == "audio":
            for keyword, score in (("FLAC", 2), ("MP3", 1), ("website", -1)):
                if keyword in dl.attrs:
                    return score
            raise Exception("Unknow audio type: %r" % (dl.attrs))
        raise Exception("Unknown dls type: %r" % (dl,))

    def chooseDownloads(self, dls):
        """Return ``(score, download)`` pairs sorted by descending score.

        Downloads with equal scores keep their relative input order.
        """
        ranked = [(self.scoreDownload(candidate), candidate) for candidate in dls]
        ranked.sort(key=lambda pair: pair[0], reverse=True)
        return ranked

    def __call__(self, dls):
        """Shorthand for ``chooseDownloads(dls)``."""
        return self.chooseDownloads(dls)
def main(fn):
    """Print a ranking table for every download group, then fetch the picks.

    For each game parsed from the file *fn* and each of its download groups,
    every download is printed with its score; the best download with a
    non-negative score (if any) is marked with ``*`` and queued.  Each queued
    download's ``torrent`` URL is then fetched and written to the current
    directory under the base name of the URL path.
    """
    import os
    import urllib.parse
    import urllib.request

    rank = FileSelector()
    queued = []
    for game in parseGamesFromFile(fn):
        for group in game.downloads:
            ranked = rank(group)
            # At most one winner: the first entry whose score is not negative.
            winners = [dl for score, dl in ranked if score >= 0][:1]
            for score, dl in ranked:
                picked = dl in winners
                marker = "*" if picked else " "
                keywords = ", ".join(sorted(dl.attrs))
                print("[%s] %2d | %-20s | %-10s | %-25s | %s " % (
                    marker, score, game.title, group.id, keywords, dl.torrent))
                if picked:
                    queued.append(dl)
            if not ranked:
                print("No download for",group.id)
            print("-" * 80)

    opener = urllib.request.build_opener()
    for dl in queued:
        target = os.path.basename(urllib.parse.urlsplit(dl.torrent).path)
        print("Saving",dl.torrent,"as",target)
        with opener.open(dl.torrent) as response, open(target,"wb") as out:
            out.write(response.read())
if __name__ == '__main__':
    import sys
    # The page to scrape is the first command-line argument.  Exit with a
    # usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("Usage: %s LIBRARY_PAGE.html" % sys.argv[0])
    main(sys.argv[1])