#!/usr/bin/python3
#
# Update HIB - Scraper for the HumbleBundle library page.
# Copyright (C) 2012, Fabien Ninoles <- fabien - AT - tzone . org ->
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import logging
import operator
import os
import shlex
import sys
import urllib.parse
import urllib.request
from itertools import chain, groupby
from pprint import pprint

import bs4


def _url_basename(url):
    """Return the last path component of *url*, or "" when url is empty/None."""
    if not url:
        return ""
    return os.path.basename(urllib.parse.urlsplit(url).path)


def _text(value):
    """Return *value* as a string, mapping None to the empty string."""
    return "" if value is None else str(value)


def _format_others(others, prefix):
    """Render unrecognised soup nodes inside an <others> element.

    The nodes are bs4 tags, which have no ``format`` method (attribute
    access on a tag is a child-tag lookup), so they are rendered with
    ``str()`` and every resulting line is indented by *prefix*.
    """
    lines = [prefix + "<others>"]
    for node in others:
        lines.extend(prefix + " " + line for line in str(node).splitlines())
    lines.append(prefix + "</others>")
    return "\n".join(lines) + "\n"


class Download:
    """A single downloadable file parsed from one download button.

    Attributes set by ``__init__``:
      dltype  -- identifier of the owning ``Downloads`` group.
      id      -- the element's CSS classes (minus "download") followed by
                 the words of the button label, joined with spaces.
      attrs   -- set of normalised tokens derived from ``id`` (see ``subst``).
      torrent -- value of the ``data-bt`` attribute, or None when absent.
      web     -- value of the ``data-web`` attribute.
      size, md5, date -- detail strings, "Unknown" when the tag is missing.
    """

    # Maps raw tokens found in class names / button labels to the
    # normalised attribute tokens stored in ``attrs``.
    subst = {
        "arc32": ("x86",),
        "arc64": ("x64",),
        "i386.deb": ("x86", "deb"),
        "x86_64.deb": ("x64", "deb"),
        "i686.rpm": ("x86", "rpm"),
        ".i386.rpm": ("x86", "rpm"),
        "x86_64.rpm": ("x64", "rpm"),
        ".x86_64.rpm": ("x64", "rpm"),
        "i386.tar.gz": ("x86", "tgz"),
        "x86_64.tar.gz": ("x64", "tgz"),
        ".tar.gz": ("tgz",),
        ".deb": ("deb",),
        ".rpm": ("rpm",),
        "32-bit": ("x86",),
        "64-bit": ("x64",),
        "(HD)": ("HD",),
        "(MP3)": ("MP3",),
    }

    # Tokens dropped when building ``attrs``.
    _IGNORED = ("Download", "small", "")

    def __init__(self, dltype, soup):
        """Parse one download entry.

        dltype -- identifier of the enclosing downloads group.
        soup   -- bs4 tag of the entry; it must contain a "flexbtn"
                  element (with a span label and an anchor carrying
                  ``data-web``) and a "dldetails"/"dlsize" element.
        """
        self.dltype = dltype
        ids = [attr for attr in soup["class"] if attr != "download"]
        button = soup.find(class_="flexbtn")
        ids.extend(button.span.string.split(" "))
        self.id = " ".join(ids)
        self.attrs = set(chain.from_iterable(self._cleanup(attr) for attr in ids))

        urls = button.a.attrs
        logging.debug("URLS are %r", urls)
        self.torrent = urls.get("data-bt")
        self.web = urls["data-web"]

        details = soup.find(class_="dldetails").find(class_="dlsize")
        size = details.find(class_="mbs")
        md5 = details.find(class_="dlmd5")
        date = details.find(class_="dldate")
        self.size = size.string if size else "Unknown"
        self.md5 = md5.string if md5 else "Unknown"
        self.date = date.string if date else "Unknown"

    def _cleanup(self, attr):
        """Yield the normalised tokens for one raw token (none if ignored)."""
        attr = attr.strip()
        if attr in self._IGNORED:
            return
        for token in self.subst.get(attr, (attr,)):
            yield token

    def format(self, prefix=""):
        """Return an XML-like description, every line starting with *prefix*.

        Missing values (e.g. no torrent URL) are rendered as empty elements
        instead of raising TypeError on string concatenation.
        """
        lines = [
            '%s<download type="%s" id="%s">' % (prefix, self.dltype, self.id),
            "%s <web>%s</web>" % (prefix, _text(self.web)),
            "%s <torrent>%s</torrent>" % (prefix, _text(self.torrent)),
            "%s <size>%s</size>" % (prefix, _text(self.size)),
            "%s <md5>%s</md5>" % (prefix, _text(self.md5)),
            "%s <date>%s</date>" % (prefix, _text(self.date)),
            "%s</download>" % prefix,
        ]
        return "\n".join(lines)

    def __repr__(self):
        return self.format()


class Downloads:
    """A group of ``Download`` entries parsed from one "downloads" element.

    Attributes:
      id       -- first CSS class that is not "downloads" or "js-platform".
      elements -- list of parsed ``Download`` objects.
      others   -- child tags that were not recognised.
    """

    def __init__(self, soup):
        """Parse *soup*; raises IndexError if it has no identifying class."""
        self.id = [
            class_
            for class_ in soup["class"]
            if class_ not in ("downloads", "js-platform")
        ][0]
        self.elements = []
        self.others = []
        self.addchilds(soup)

    def addchilds(self, soup):
        """Walk the child tags of *soup*, collecting downloads recursively."""
        logging.debug("Parsing soup for downloads %s", self.id)
        for child in soup.children:
            if not isinstance(child, bs4.element.Tag):
                continue
            classes = child.get("class", [])
            if any(attr in ("arc-toggle", "downloads") for attr in classes):
                # Nested container: its children belong to this group too.
                self.addchilds(child)
            elif "download-buttons" in classes:
                self._add_buttons(child)
            elif any(attr in ("clearfix", "label") for attr in classes):
                pass
            else:
                self.others.append(child)

    def _add_buttons(self, container):
        """Create a ``Download`` for every non-stream button in *container*."""
        for entry in container.children:
            if not isinstance(entry, bs4.element.Tag):
                continue
            btn = entry.find(class_="flexbtn")
            if not btn:
                continue
            if btn.span.string == "Stream":
                logging.info("Ignoring Stream URLs for %s", self.id)
            else:
                self.elements.append(Download(self.id, entry))

    def __iter__(self):
        return iter(self.elements)

    def format(self, prefix=""):
        """Return an XML-like description, every line starting with *prefix*."""
        parts = [prefix + '<downloads id="' + self.id + '">\n']
        for el in self.elements:
            parts.append(el.format(prefix + " ") + "\n")
        if self.others:
            parts.append(_format_others(self.others, prefix + " "))
        parts.append(prefix + "</downloads>")
        return "".join(parts)

    def __repr__(self):
        return self.format()


class Game:
    """One library row: a title plus its ``Downloads`` groups.

    Attributes:
      title     -- stripped title text, "unknown" if no "gameinfo" child.
      downloads -- list of ``Downloads`` objects.
      others    -- child tags that were not recognised.
    """

    def __init__(self, soup):
        self.title = "unknown"
        self.downloads = []
        self.others = []
        for child in soup.children:
            if not isinstance(child, bs4.element.Tag):
                continue
            classes = child.get("class", [])
            if "gameinfo" in classes:
                self.title = child.find(class_="title").a.string.strip()
            elif "downloads" in classes:
                logging.debug("Collecting downloadables for %s", self.title)
                self.downloads.append(Downloads(child))
            elif any(attr in ("icn", "clearfix") for attr in classes):
                pass
            else:
                self.others.append(child)

    def __repr__(self):
        parts = ["<game>\n", " <title>" + self.title + "</title>\n"]
        if self.downloads:
            parts.append(" <downloads>\n")
            for dl in self.downloads:
                parts.append(dl.format(" ") + "\n")
            parts.append(" </downloads>\n")
        if self.others:
            parts.append(_format_others(self.others, " "))
        parts.append("</game>")
        return "".join(parts)


def parseGamesFromSoup(soup):
    """Yield a ``Game`` for every element of class "row" in *soup*."""
    for row in soup.find_all(class_="row"):
        yield Game(row)


def parseGamesFromFile(filename):
    """Yield the games found in the HTML file *filename*.

    The document is parsed inside the ``with`` block (the BeautifulSoup
    constructor reads the file), so the handle is closed before any game
    is yielded.
    """
    with open(filename) as source:
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed; left as in the original to avoid changing parse results.
        soup = bs4.BeautifulSoup(source)
    yield from parseGamesFromSoup(soup)


class FileSelector:
    """Scores downloads; a score above zero marks a wanted download."""

    def scoreDownload(self, dl):
        """Return the integer preference score of *dl*.

        Raises ValueError (an Exception subclass) for an unknown download
        type or an unrecognised audio format.
        """
        attrs = dl.attrs
        if dl.dltype == "audio":
            # An empty attribute set means there is nothing to choose
            # between, so the download is simply taken.
            if not attrs or any(fmt in attrs for fmt in ("FLAC", "OGG", "MP3")):
                return 1
            if "website" in attrs:
                return -1
            if "AAC" in attrs:
                return 1
            raise ValueError("Unknown audio type: %r" % (attrs,))
        if dl.dltype in ("mac", "windows", "android"):
            return -1
        if dl.dltype == "linux":
            score = 1
            if "x64" in attrs:
                score += 2
            if "deb" in attrs:
                score += 1
            if "Stream" in attrs:
                score -= 1
            return score
        if dl.dltype == "ebook":
            if "MOBI" in attrs:
                return -1
            if "HD" in attrs:
                return 2
            return 1
        raise ValueError("Unknown dls type: %r" % (dl,))

    def chooseDownloads(self, dls):
        """Return ``(score, download)`` pairs sorted by descending score."""
        scored = [(self.scoreDownload(dl), dl) for dl in dls]
        scored.sort(key=operator.itemgetter(0), reverse=True)
        return scored

    def __call__(self, dls):
        return self.chooseDownloads(dls)


def selectHighestScore(scores):
    """Return the downloads sharing the highest score, if that score is > 0.

    scores -- iterable of ``(score, download)`` pairs.
    Returns an empty list when *scores* is empty or the best score is not
    positive.
    """
    scores = list(scores)
    if not scores:
        logging.debug("Empty scores list: %r", scores)
        return []
    get_score = operator.itemgetter(0)
    ranked = sorted(scores, key=get_score, reverse=True)
    # The first group of the descending ranking holds the top scorers.
    best, group = next(groupby(ranked, get_score))
    if best <= 0:
        return []
    return [dl for _, dl in group]


class tee:
    """File-like writer that duplicates every write to several targets."""

    def __init__(self, main, *other):
        self.main = main
        self.other = other

    def write(self, s):
        self.main.write(s)
        for o in self.other:
            o.write(s)


def _save_torrent(opener, dl):
    """Download ``dl.torrent`` into the current directory unless it exists.

    Failures are logged and swallowed so one bad download does not stop
    the remaining ones.
    """
    try:
        name = _url_basename(dl.torrent)
        if os.path.exists(name):
            logging.info("Skipping existing torrent %s", name)
            return
        logging.info("Saving %s as %s", dl.torrent, name)
        # Fetch everything before creating the file: a failed transfer must
        # not leave an empty file that a later run would skip as existing.
        with opener.open(dl.torrent) as response:
            data = response.read()
        with open(name, "wb") as out:
            out.write(data)
        logging.info("%s saved.", os.path.realpath(name))
    except Exception:
        logging.exception("Error with download %r", dl)


def main(fn):
    """Process the saved library page *fn*.

    Writes a scoring report to ``torrents.log``, saves the torrent file of
    every selected download, and writes ``wget`` commands for selected
    downloads without a torrent to ``http-download.sh``.
    """
    selector = FileSelector()
    downloads = []
    with open("torrents.log", "w") as log:
        for game in parseGamesFromFile(fn):
            logging.info(
                "Parsing game %s (%d downloads)", game.title, len(game.downloads)
            )
            for dls in game.downloads:
                scores = list(selector(dls))
                choosen = selectHighestScore(scores)
                for score, dl in scores:
                    selected = dl in choosen
                    # Fall back to the web URL so entries without a
                    # torrent still show something meaningful.
                    url = dl.torrent or dl.web
                    print(
                        "[%s] %2d | %-30s | %-15s | %-30s | %-15s | %s <%s>"
                        % (
                            "*" if selected else " ",
                            score,
                            game.title,
                            dls.id,
                            dl.date,
                            ", ".join(sorted(dl.attrs)),
                            _url_basename(url),
                            url,
                        ),
                        file=log,
                    )
                    if selected:
                        downloads.append(dl)
                if not scores:
                    print("No download for %s" % (dls.id), file=log)
            print("-" * 80, file=log)

    opener = urllib.request.build_opener()
    with open("http-download.sh", "w") as urlfile:
        for dl in downloads:
            if dl.torrent:
                _save_torrent(opener, dl)
            else:
                logging.info("No torrent, url is %s", dl.web)
                # Both values come from scraped HTML; quote them so the
                # generated shell script cannot be broken or hijacked.
                urlfile.write(
                    "wget --progress=bar -c -O %s %s\n"
                    % (shlex.quote(_url_basename(dl.web)), shlex.quote(dl.web))
                )


if __name__ == '__main__':
    if len(sys.argv) != 2:
        sys.exit("Usage: %s LIBRARY_PAGE.html" % sys.argv[0])
    logging.getLogger().setLevel(logging.INFO)
    main(sys.argv[1])