update-hib.py
author Fabien Ninoles <fabien@tzone.org>
Sun, 27 Jul 2014 21:07:54 -0400
changeset 13 7567c5e4db45
parent 12 9d5880ecdb82
child 14 f7112a0f9df7
permissions -rwxr-xr-x
Add a "cache-dir" option to the script.

#!/usr/bin/python3
#
# Update HIB - Scraper for the HumbleBundle library page.
# Copyright (C) 2012, Fabien Ninoles <- fabien - AT - tzone . org ->
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import bs4
from pprint import pprint
from itertools import chain, groupby
import logging
import operator

class Download:
    """A single downloadable file parsed from one ``download`` element.

    Attributes set by the constructor:
      dltype  -- identifier handed in by the caller (the owning group's id)
      id      -- space-joined CSS classes (minus "download") plus the words
                 of the button label
      attrs   -- set of normalised tokens derived from the same words
      torrent -- value of the link's ``data-bt`` attribute, None when absent
      web     -- value of the link's ``data-web`` attribute
      size, md5, date -- text of the matching detail element or "Unknown"
    """
    # Maps raw class names / label words to the normalised tokens stored
    # in ``attrs``; words that are not listed are kept unchanged.
    subst = { "arc32"         : ("x86",),
              "arc64"         : ("x64",),
              "i386.deb"      : ("x86","deb"),
              "x86_64.deb"    : ("x64", "deb"),
              "i686.rpm"      : ("x86", "rpm"),
              ".i386.rpm"     : ("x86", "rpm"),
              "x86_64.rpm"    : ("x64", "rpm"),
              ".x86_64.rpm"   : ("x64", "rpm"),
              "i386.tar.gz"   : ("x86", "tgz"),
              "x86_64.tar.gz" : ("x64", "tgz"),
              ".tar.gz"       : ("tgz",),
              ".deb"          : ("deb",),
              ".rpm"          : ("rpm",),
              "32-bit"        : ("x86",),
              "64-bit"        : ("x64",),
              "(HD)"          : ("HD",),
              "(MP3)"         : ("MP3",),
              }
    def __init__(self, dltype, soup):
        """Extract the download description from ``soup``.

        ``soup`` must carry a ``class`` attribute and contain a ``flexbtn``
        element with a ``span`` label and an ``a`` link; a link without
        ``data-web`` raises KeyError.
        """
        self.dltype = dltype
        ids = [attr for attr in soup["class"] if attr != "download"]
        button = soup.find(class_="flexbtn")
        desc = button.span.string
        ids.extend(desc.split(" "))
        self.id = " ".join(ids)
        def cleanup(attr):
            # Drop noise words and expand known aliases through ``subst``.
            attr = attr.strip()
            if attr not in ("Download","small",""):
                for s in self.subst.get(attr,(attr,)):
                    yield s
        self.attrs = set(chain.from_iterable(cleanup(attr) for attr in ids))
        urls = button.a.attrs
        logging.debug("URLS are %r", urls)
        self.torrent = urls.get("data-bt")
        self.web = urls["data-web"]
        details = soup.find(class_="dldetails")
        if details is not None:
            details = details.find(class_="dlsize")
        def detail(class_name):
            # Fall back to "Unknown" when the element is missing or has no
            # single text child, so format() never has to concatenate None.
            tag = details.find(class_=class_name) if details is not None else None
            text = tag.string if tag is not None else None
            return text if text is not None else "Unknown"
        self.size = detail("mbs")
        self.md5 = detail("dlmd5")
        self.date = detail("dldate")
    def format(self, prefix=""):
        """Return an XML-like, multi-line description indented by ``prefix``.

        The ``<torrent>`` line is left out when no torrent is known
        (``torrent`` is None) instead of failing on the concatenation.
        """
        lines = [prefix + '<download type="' + self.dltype + '" id="' + self.id + '">',
                 prefix + "  <web>" + self.web + "</web>"]
        if self.torrent is not None:
            lines.append(prefix + "  <torrent>" + self.torrent + "</torrent>")
        lines.append(prefix + "  <size>" + self.size + "</size>")
        lines.append(prefix + "  <md5>" + self.md5 + "</md5>")
        lines.append(prefix + "  <date>" + self.date + "</date>")
        lines.append(prefix + "</download>")
        return "\n".join(lines)
    def __repr__(self):
        return self.format()

class Downloads:
    """The Download entries found under one ``downloads`` element.

    Attributes:
      id       -- first CSS class of the element other than "downloads"
                  and "js-platform"; also passed to each Download as its type
      elements -- Download objects, in document order
      others   -- child tags that were not recognised (kept for inspection)

    Iterating over an instance yields its Download objects.
    """
    def __init__(self, soup):
        """Parse ``soup``; raises IndexError if it has no identifying class."""
        self.id = [class_ for class_ in soup["class"] if class_ not in ("downloads","js-platform")][0]
        self.elements = []
        self.others = []
        self.addchilds(soup)
    def addchilds(self, soup):
        """Collect downloads from the children of ``soup``.

        Recurses into "arc-toggle" / "downloads" containers, reads
        "download-buttons" children, ignores "clearfix" / "label" children
        and stores every other tag in ``others``.
        """
        logging.debug("Parsing soup for downloads %s", self.id)
        for child in soup.children:
            if not isinstance(child, bs4.element.Tag):
                continue  # text nodes, comments, ...
            classes = child.attrs.get("class", [])
            if any(attr in ("arc-toggle", "downloads") for attr in classes):
                self.addchilds(child)
            elif "download-buttons" in classes:
                for subchild in child.children:
                    if not isinstance(subchild, bs4.element.Tag):
                        continue
                    btn = subchild.find(class_="flexbtn")
                    if btn is None:
                        continue
                    desc = btn.span.string
                    if desc == "Stream":
                        logging.info("Ignoring Stream URLs for %s", self.id)
                    else:
                        self.elements.append(Download(self.id, subchild))
            elif any(attr in ("clearfix", "label") for attr in classes):
                pass
            else:
                self.others.append(child)
    def __iter__(self):
        return iter(self.elements)
    def format(self, prefix = ""):
        """Return an XML-like, multi-line description indented by ``prefix``."""
        lines = [prefix + '<downloads id="' + self.id + '">']
        for el in self.elements:
            lines.append(el.format(prefix + "  "))
        if self.others:
            lines.append(prefix + "  <others>")
            for o in self.others:
                # ``others`` holds raw parser tags, which offer no format()
                # method (bs4 resolves unknown attributes as child-tag
                # lookups), so render their markup via str() and indent it.
                for line in str(o).splitlines():
                    lines.append(prefix + "    " + line)
            lines.append(prefix + "  </others>")
        lines.append(prefix + "</downloads>")
        return "\n".join(lines)
    def __repr__(self):
        return self.format()

class Game:
    """One library row: a title plus its groups of downloads.

    Attributes:
      title     -- stripped text of the "title" element inside "gameinfo",
                   or "unknown" when it cannot be determined
      downloads -- list of Downloads objects, one per "downloads" child
      others    -- child tags that were not recognised (kept for inspection)
    """
    def __init__(self, soup):
        """Parse the direct children of the row element ``soup``."""
        self.title = "unknown"
        self.downloads = []
        self.others = []
        for child in soup.children:
            if not isinstance(child, bs4.element.Tag):
                continue  # text nodes, comments, ...
            classes = child.attrs.get("class", [])
            if "gameinfo" in classes:
                title = self._extract_title(child)
                if title is not None:
                    self.title = title
            elif "downloads" in classes:
                logging.debug("Collecting downloadables for %s", self.title)
                self.downloads.append(Downloads(child))
            elif any(attr in ("icn", "clearfix") for attr in classes):
                pass
            else:
                self.others.append(child)
    @staticmethod
    def _extract_title(info):
        """Return the stripped title text found in ``info``, or None."""
        node = info.find(class_="title")
        if node is None:
            return None
        if node.a is not None:
            # Prefer the link text when the title is wrapped in a link.
            node = node.a
        text = node.string
        if text is None:
            # .string is None when the node has several children; fall back
            # to the concatenated text instead of failing on None.strip().
            text = node.get_text()
        return text.strip()
    def __repr__(self):
        lines = ["<game>", "  <title>" + self.title + "</title>"]
        if self.downloads:
            lines.append("  <downloads>")
            for dl in self.downloads:
                lines.append(dl.format("    "))
            lines.append("  </downloads>")
        if self.others:
            lines.append("  <others>")
            for o in self.others:
                # ``others`` holds raw parser tags, which offer no format()
                # method (bs4 resolves unknown attributes as child-tag
                # lookups), so render their markup via str() and indent it.
                for line in str(o).splitlines():
                    lines.append("    " + line)
            lines.append("  </others>")
        lines.append("</game>")
        return "\n".join(lines)

def parseGamesFromSoup(soup):
    """Lazily yield one Game per element of class "row" found in ``soup``."""
    rows = soup.find_all(class_="row")
    yield from map(Game, rows)

def parseGamesFromFile(filename):
    """Yield the Game objects parsed from the saved page ``filename``.

    The file is read and parsed when the first item is requested. It is
    closed as soon as parsing is done, rather than being left open until
    garbage collection.
    """
    with open(filename) as page:
        soup = bs4.BeautifulSoup(page)
    yield from parseGamesFromSoup(soup)

class FileSelector:
    """Callable that ranks downloads by how desirable they are.

    A score above zero marks a download worth fetching; zero or below
    marks one to leave alone.
    """
    def scoreDownload(self, dl):
        """Return the integer score of ``dl`` based on its type and attrs.

        Raises Exception for an unknown download type or an audio
        download whose attributes are not recognised.
        """
        kind = dl.dltype
        if kind in ("mac", "windows", "android"):
            return -1
        if kind == "audio":
            tags = dl.attrs
            if not tags:
                # No attribute at all: nothing to choose from, take it.
                return 1
            for wanted in ("FLAC", "OGG", "MP3"):
                if wanted in tags:
                    return 1
            if "website" in tags:
                return -1
            if "AAC" in tags:
                return 1
            raise Exception("Unknown audio type: %r" % (tags))
        if kind == "linux":
            tags = dl.attrs
            total = 1
            for tag, delta in (("x64", 2), ("deb", 1), ("Stream", -1)):
                if tag in tags:
                    total += delta
            return total
        if kind == "ebook":
            return -1 if "MOBI" in dl.attrs else 1
        raise Exception("Unknown dls type: %r" % (dl,))

    def chooseDownloads(self, dls):
        """Return ``(score, download)`` pairs sorted by decreasing score."""
        ranked = [(self.scoreDownload(candidate), candidate) for candidate in dls]
        ranked.sort(key=operator.itemgetter(0), reverse=True)
        return ranked

    def __call__(self, dls):
        """Shortcut for chooseDownloads(dls)."""
        return self.chooseDownloads(dls)

def selectHighestScore(scores):
    """Return the downloads sharing the best score in ``scores``.

    ``scores`` holds ``(score, download)`` pairs. The result is empty
    when ``scores`` is empty or when the best score is not above zero.
    """
    if not scores:
        logging.debug("Empty scores list: %r", scores)
        return []
    by_score = operator.itemgetter(0)
    ranked = sorted(scores, key=by_score, reverse=True)
    # The first group of the descending ranking holds every top-scored pair.
    best, winners = next(groupby(ranked, by_score))
    if best > 0:
        return [candidate for _, candidate in winners]
    return []

class tee:
    """Write-only fan-out: forwards each write to ``main`` and every ``other``."""
    def __init__(self, main, *other):
        self.main, self.other = main, other
    def write(self, s):
        """Write ``s`` to the main target first, then to the others in order."""
        for sink in chain((self.main,), self.other):
            sink.write(s)

def main(fn, cachedir):
    """Select the preferred downloads of a saved library page and fetch them.

    fn       -- path of the saved library page to parse
    cachedir -- directory searched for torrent files fetched earlier

    Side effects, all relative to the current directory:
      * ``torrents.log`` lists every candidate with its score; the
        selected ones are marked with ``*``.
      * each selected torrent is hard-linked from ``cachedir`` or
        downloaded, unless a file of that name already exists.
      * ``http-download.sh`` receives one ``wget`` command per selected
        download that has no torrent.
    """
    import os
    import shlex
    import urllib.parse
    import urllib.request

    def url_basename(url):
        # Last path component of ``url``, used as the local file name.
        return os.path.basename(urllib.parse.urlsplit(url).path)

    selector = FileSelector()
    downloads = []
    with open("torrents.log", "w") as l:
        for game in parseGamesFromFile(fn):
            logging.info("Parsing game %s (%d downloads)", game.title, len(game.downloads))
            for dls in game.downloads:
                scores = list(selector(dls))
                choosen = selectHighestScore(scores)
                for score, dl in scores:
                    selected = dl in choosen
                    # Downloads without a torrent have torrent == None; show
                    # their web URL rather than handing None to urlsplit.
                    url = dl.torrent or dl.web
                    print("[%s] %2d | %-30s | %-15s | %-30s | %-15s | %s <%s>" % (
                            "*" if selected else " ",
                            score,
                            game.title,
                            dls.id,
                            dl.date,
                            ", ".join(sorted(dl.attrs)),
                            url_basename(url),
                            url),
                          file=l)
                    if selected:
                        downloads.append(dl)
                if not scores:
                    print("No download for %s" % (dls.id), file=l)
                print("-" * 80, file=l)

    with open('http-download.sh','w') as urlfile:
        opener = urllib.request.build_opener()
        cache = set(os.listdir(cachedir))
        for dl in downloads:
            if not dl.torrent:
                logging.info("No torrent, url is %s", dl.web)
                # The values come from the scraped page: quote them so that
                # they cannot be interpreted by the shell running the script.
                urlfile.write("wget --progress=bar -c -O %s %s\n" % (
                        shlex.quote(url_basename(dl.web)), shlex.quote(dl.web)))
                continue
            try:
                target = url_basename(dl.torrent)
                if os.path.exists(target):
                    logging.info("Skipping existing torrent %s", target)
                elif target in cache:
                    logging.info("Copying %s as %s from cache", dl.torrent, target)
                    os.link(os.path.join(cachedir, target), target)
                else:
                    logging.info("Saving %s as %s", dl.torrent, target)
                    # Fetch everything before creating the file: a failed
                    # transfer must not leave an empty file that the next
                    # run would skip as "existing".
                    with opener.open(dl.torrent) as u:
                        data = u.read()
                    with open(target,"wb") as f:
                        f.write(data)
                    logging.info("%s saved.", os.path.realpath(target))
            except Exception:
                # Best effort: report the failure and go on with the next
                # download; KeyboardInterrupt/SystemExit still propagate.
                logging.exception("Error with download %r", dl)



if __name__ == '__main__':
    import sys
    # Both positional arguments are required; report how to call the script
    # instead of failing with an IndexError traceback.
    if len(sys.argv) < 3:
        sys.exit("usage: %s LIBRARY_PAGE CACHE_DIR" % sys.argv[0])
    logging.getLogger().setLevel(logging.INFO)
    main(sys.argv[1], sys.argv[2])