Add torrents.log.
Add support for new Stream section.
Add more logging.
#!/usr/bin/python3
#
# Update HIB - Scraper for the HumbleBundle library page.
# Copyright (C) 2012, Fabien Ninoles <- fabien - AT - tzone . org ->
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import bs4
from pprint import pprint
from itertools import chain, groupby
import logging
import operator
class Download:
    """A single downloadable file described by one ``download`` element.

    Attributes set by the constructor:
      dltype  -- the download type passed in by the caller
      id      -- space-joined CSS classes (minus "download") and button words
      attrs   -- set of normalised attribute tokens derived from ``id``
      torrent -- value of the button link's ``data-bt`` attribute
      web     -- value of the button link's ``data-web`` attribute
      size, md5, date -- text of the matching detail element, or "Unknown"
    """

    # Maps a raw token found in the id to the normalised attribute(s) it
    # stands for; tokens not listed here are kept unchanged.
    subst = {
        "arc32": ("x86",),
        "arc64": ("x64",),
        "i386.deb": ("x86", "deb"),
        "x86_64.deb": ("x64", "deb"),
        "i686.rpm": ("x86", "rpm"),
        ".i386.rpm": ("x86", "rpm"),
        "x86_64.rpm": ("x64", "rpm"),
        ".x86_64.rpm": ("x64", "rpm"),
        "i386.tar.gz": ("x86", "tgz"),
        "x86_64.tar.gz": ("x64", "tgz"),
        ".tar.gz": ("tgz",),
        ".deb": ("deb",),
        ".rpm": ("rpm",),
        "32-bit": ("x86",),
        "64-bit": ("x64",),
        "(HD)": ("HD",),
        "(MP3)": ("MP3",),
    }

    def __init__(self, dltype, soup):
        """Extract id, attributes, URLs and details from a download element.

        dltype -- type label stored verbatim on the instance
        soup   -- element exposing ``["class"]`` and ``find(class_=...)``
        """
        self.dltype = dltype

        # The id is made of the element's own classes plus the button label.
        tokens = [name for name in soup["class"] if name != "download"]
        button = soup.find(class_="flexbtn")
        label = button.span.string
        tokens.extend(label.split(" "))
        self.id = " ".join(tokens)

        # Normalise every token: drop the noise words, expand known aliases.
        normalised = set()
        for raw in tokens:
            token = raw.strip()
            if token in ("Download", "small", ""):
                continue
            normalised.update(self.subst.get(token, (token,)))
        self.attrs = normalised

        link_attrs = button.a.attrs
        logging.debug("URLS are %r", link_attrs)
        self.torrent = link_attrs["data-bt"]
        self.web = link_attrs["data-web"]

        info = soup.find(class_="dldetails").find(class_="dlsize")
        size_tag = info.find(class_="mbs")
        md5_tag = info.find(class_="dlmd5")
        date_tag = info.find(class_="dldate")

        def text_or_unknown(tag):
            # A missing detail element is reported as "Unknown".
            return tag.string if tag else "Unknown"

        self.size = text_or_unknown(size_tag)
        self.md5 = text_or_unknown(md5_tag)
        self.date = text_or_unknown(date_tag)

    def format(self, prefix=""):
        """Return the XML-like multi-line description, each line led by prefix."""
        pieces = [prefix, '<download type="', self.dltype,
                  '" id="', self.id, '">\n']
        for tag, value in (("web", self.web),
                           ("torrent", self.torrent),
                           ("size", self.size),
                           ("md5", self.md5),
                           ("date", self.date)):
            pieces.extend((prefix, " <", tag, ">", value, "</", tag, ">\n"))
        pieces.extend((prefix, "</download>"))
        return "".join(pieces)

    def __repr__(self):
        """Same text as ``format()`` with an empty prefix."""
        return self.format()
class Downloads:
    """A group of Download entries found under one ``downloads`` element.

    Attributes:
      id       -- first CSS class of the element other than "downloads";
                  it is passed to each Download as its type
      elements -- Download objects collected from the element tree
      others   -- child tags that were not recognised, kept for inspection
    """

    def __init__(self, soup):
        """Collect every download found beneath soup.

        Raises IndexError when the element carries no class besides
        "downloads".
        """
        self.id = [class_ for class_ in soup["class"] if class_ != "downloads"][0]
        self.elements = []
        self.others = []
        self.addchilds(soup)

    def addchilds(self, soup):
        """Walk the children of soup, recursing into nested containers."""
        logging.debug("Parsing soup for downloads %s", self.id)
        for child in soup.children:
            # Text nodes and other non-tag children carry no downloads.
            if not isinstance(child, bs4.element.Tag):
                continue
            classes = child.attrs.get("class", [])
            if any(attr in ("arc-toggle", "downloads") for attr in classes):
                # Container element: its downloads belong to this group.
                self.addchilds(child)
            elif "download" in classes:
                desc = child.find(class_="flexbtn").span.string
                if desc == "Stream":
                    logging.info("Ignoring Stream URLs for %s", self.id)
                else:
                    self.elements.append(Download(self.id, child))
            elif any(attr in ("clearfix", "label") for attr in classes):
                # Purely presentational elements.
                pass
            else:
                self.others.append(child)

    def __iter__(self):
        """Iterate over the collected Download objects."""
        return iter(self.elements)

    @staticmethod
    def _render_other(node, prefix):
        """Return str(node) with prefix prepended to every line."""
        return "\n".join(prefix + line for line in str(node).splitlines())

    def format(self, prefix=""):
        """Return the XML-like description of the group and its contents."""
        res = prefix + '<downloads id="' + self.id + '">\n'
        for el in self.elements:
            res += el.format(prefix + " ") + "\n"
        if self.others:
            res += prefix + " <others>\n"
            for o in self.others:
                # The unrecognised children are bs4 tags, which have no
                # format() method: attribute access on a tag looks up a child
                # tag of that name and yields None. Render their markup.
                res += self._render_other(o, prefix + " ") + "\n"
            res += prefix + " </others>\n"
        res += prefix + "</downloads>"
        return res

    def __repr__(self):
        """Same text as ``format()`` with an empty prefix."""
        return self.format()
class Game:
    """One library row: a title plus its groups of downloads.

    Attributes:
      title     -- text of the title link, or "unknown" if no ``gameinfo``
                   child was seen
      downloads -- one Downloads object per ``downloads`` child
      others    -- child tags that were not recognised, kept for inspection
    """

    def __init__(self, soup):
        """Build the game from the direct children of a row element."""
        self.title = "unknown"
        self.downloads = []
        self.others = []
        for child in soup.children:
            # Text nodes and other non-tag children are irrelevant here.
            if not isinstance(child, bs4.element.Tag):
                continue
            classes = child.attrs.get("class", [])
            if "gameinfo" in classes:
                self.title = child.find(class_="title").a.string.strip()
            elif "downloads" in classes:
                logging.debug("Collecting downloadables for %s", self.title)
                self.downloads.append(Downloads(child))
            elif any(attr in ("icn", "clearfix") for attr in classes):
                # Purely presentational elements.
                pass
            else:
                self.others.append(child)

    @staticmethod
    def _render_other(node, prefix):
        """Return str(node) with prefix prepended to every line."""
        return "\n".join(prefix + line for line in str(node).splitlines())

    def __repr__(self):
        """Return the XML-like description of the game."""
        res = "<game>\n"
        res += " <title>" + self.title + "</title>\n"
        if self.downloads:
            res += " <downloads>\n"
            for dl in self.downloads:
                res += dl.format(" ") + "\n"
            res += " </downloads>\n"
        if self.others:
            res += " <others>\n"
            for o in self.others:
                # The unrecognised children are bs4 tags, which have no
                # format() method: attribute access on a tag looks up a child
                # tag of that name and yields None. Render their markup.
                res += self._render_other(o, " ") + "\n"
            res += " </others>\n"
        res += "</game>"
        return res
def parseGamesFromSoup(soup):
    """Lazily yield one Game for every element of class ``row`` in soup."""
    rows = soup.find_all(class_="row")
    # map() stays lazy, so each Game is only built when it is requested.
    yield from map(Game, rows)
def parseGamesFromFile(filename):
    """Yield the Game objects parsed from the HTML file at filename.

    The file is opened when iteration starts and closed as soon as it has
    been parsed, before the first game is yielded.
    """
    # The document is built from the handle inside the with-block, so the
    # handle is no longer needed once the block ends. The original passed a
    # bare open() result that was never closed.
    with open(filename) as page:
        soup = bs4.BeautifulSoup(page)
    for game in parseGamesFromSoup(soup):
        yield game
class FileSelector:
    """Callable that ranks downloads by how desirable they are.

    A score above zero marks a wanted download; zero or below marks one
    that should not be fetched.
    """

    def scoreDownload(self, dl):
        """Return the integer score of dl from its ``dltype`` and ``attrs``.

        Raises Exception for an unknown download type, or for an audio
        download whose attributes match none of the known formats.
        """
        kind = dl.dltype

        if kind == "audio":
            tags = dl.attrs
            # An empty attribute set is accepted as-is; these formats win
            # even when "website" is present too.
            if not tags or "FLAC" in tags or "OGG" in tags or "MP3" in tags:
                return 1
            if "website" in tags:
                return -1
            if "AAC" in tags:
                return 1
            raise Exception("Unknown audio type: %r" % (dl.attrs))

        if kind in ("mac", "windows", "android"):
            return -1

        if kind == "linux":
            # Base score of 1, adjusted by every matching attribute.
            weights = (("x64", 2), ("deb", 1), ("Stream", -1))
            return 1 + sum(weight for tag, weight in weights if tag in dl.attrs)

        if kind == "ebook":
            if "MOBI" in dl.attrs:
                return -1
            return 2 if "HD" in dl.attrs else 1

        raise Exception("Unknown dls type: %r" % (dl,))

    def chooseDownloads(self, dls):
        """Return ``(score, download)`` pairs, highest score first.

        Downloads with equal scores keep their original relative order.
        """
        ranked = [(self.scoreDownload(dl), dl) for dl in dls]
        ranked.sort(key=operator.itemgetter(0), reverse=True)
        return ranked

    def __call__(self, dls):
        """Shorthand for ``chooseDownloads(dls)``."""
        return self.chooseDownloads(dls)
def selectHighestScore(scores):
    """Return the downloads sharing the best score, if that score is positive.

    scores -- sequence of ``(score, download)`` pairs
    Returns a list of downloads; empty when scores is empty or when the
    best score is zero or negative.
    """
    if not scores:
        logging.debug("Empty scores list: %r", scores)
        return []

    by_score = operator.itemgetter(0)
    ranked = sorted(scores, key=by_score, reverse=True)
    # The first group of the descending ordering holds every top entry.
    best, leaders = next(groupby(ranked, by_score))
    if best > 0:
        return [entry[1] for entry in leaders]
    return []
class tee:
    """Write-only fan-out: each write goes to ``main`` and then to every
    additional writer, in the order they were given."""

    def __init__(self, main, *other):
        """Remember the primary writer and any number of extra writers."""
        self.main, self.other = main, other

    def write(self, s):
        """Forward s to the primary writer first, then to the extras."""
        for sink in (self.main, *self.other):
            sink.write(s)
def main(fn):
    """Pick the preferred downloads from a saved page and fetch them.

    fn -- path of the HTML file to parse

    Side effects, all relative to the current directory:
      * ``torrents.log`` is rewritten with one line per scored download,
        a ``*`` marking the selected ones;
      * each selected download that has a torrent URL gets its torrent
        file saved, unless a file of that name already exists;
      * ``http-download.sh`` is rewritten with one ``wget`` command per
        selected download that has no torrent URL.

    A failure while fetching one torrent is logged and does not stop the
    remaining downloads.
    """
    import os
    import shlex
    import urllib.parse
    import urllib.request

    selector = FileSelector()
    downloads = []
    with open("torrents.log", "w") as log:
        for game in parseGamesFromFile(fn):
            logging.info("Parsing game %s (%d downloads)", game.title, len(game.downloads))
            for dls in game.downloads:
                scores = list(selector(dls))
                choosen = selectHighestScore(scores)
                for score, dl in scores:
                    selected = dl in choosen
                    print("[%s] %2d | %-30s | %-15s | %-30s | %-15s | %s " % (
                        "*" if selected else " ",
                        score,
                        game.title,
                        dls.id,
                        dl.date,
                        ", ".join(sorted(dl.attrs)),
                        dl.torrent),
                        file=log)
                    if selected:
                        downloads.append(dl)
                if not scores:
                    print("No download for %s" % (dls.id), file=log)
                # Separator after each download section.
                print("-" * 80, file=log)

    opener = urllib.request.build_opener()
    # The with-block guarantees the script is flushed and closed even when a
    # download fails unexpectedly.
    with open('http-download.sh', 'w') as urlfile:
        for dl in downloads:
            if dl.torrent:
                try:
                    torrent_name = os.path.basename(urllib.parse.urlsplit(dl.torrent).path)
                    if os.path.exists(torrent_name):
                        logging.info("Skipping existing torrent %s", torrent_name)
                    else:
                        logging.info("Saving %s as %s", dl.torrent, torrent_name)
                        with opener.open(dl.torrent) as u:
                            with open(torrent_name, "wb") as f:
                                f.write(u.read())
                        logging.info("%s saved.", os.path.realpath(torrent_name))
                except Exception:
                    # Best effort: log the failure and carry on, but let
                    # KeyboardInterrupt and SystemExit propagate.
                    logging.exception("Error with download %r", dl)
            else:
                logging.info("No torrent, url is %s", dl.web)
                web_name = os.path.basename(urllib.parse.urlsplit(dl.web).path)
                # Both values come from scraped HTML, so quote them before
                # they end up on a shell command line.
                urlfile.write("wget --progress=bar -c -O %s %s\n" % (
                    shlex.quote(web_name), shlex.quote(dl.web)))
if __name__ == '__main__':
    import sys
    logging.getLogger().setLevel(logging.INFO)
    # Exit with a usage message instead of an IndexError traceback when the
    # page to parse was not given on the command line.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <saved-library-page>" % sys.argv[0])
    main(sys.argv[1])