#!/usr/bin/python
# This script is (c) 2009 Michael Wojciechowski.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# Find out if any new movie torrents with high leecher count are
# present on the pirate bay.
#
# Requirements: python 2.5.x, sqlite3.
#
# Run this script from cron every half hour or so.

import os
import re
import sgmllib
import urllib

try:
    # Keep preferring the external pysqlite2 build when it is installed ...
    from pysqlite2 import dbapi2 as sqlite3
except ImportError:
    # ... but fall back to the sqlite3 module bundled with Python >= 2.5.
    import sqlite3

SITEROOT = "http://thepiratebay.org"


class urlList:
    """List of URLs and their age, persisted in an sqlite database.

    Each row of table ``tpbg`` holds a torrent URL, the link seen just
    before it (used as its description page) and an integer age.  Age 0
    marks a row inserted during the current run; ``incAge`` bumps every
    row by one, so age roughly counts runs since the row was last seen.
    """

    def __init__(self, filename):
        """Open (or create) the sqlite database stored at ``filename``."""
        self.filename = filename
        self.conn = sqlite3.connect(filename)
        self.curr = self.conn.cursor()
        self.printDebug = False

    def initDb(self):
        """Create the ``tpbg`` table unless it already exists."""
        try:
            # Probing with a select raises OperationalError when the
            # table is missing.
            self.curr.execute('select * from tpbg')
            dbPresent = True
        except sqlite3.OperationalError:
            dbPresent = False

        if not dbPresent:
            self.curr.execute(
                '''create table tpbg (url text, descr text, age integer)''')
            if self.printDebug:
                print("Created db.")
        elif self.printDebug:
            print("Using existing db.")

    def add(self, value, descr_link):
        """Record torrent URL ``value`` with description ``descr_link``.

        An unknown (url, descr) pair is inserted with age 0 so that
        ``getNew`` reports it.  A known pair has its age reset to 1: it
        stays young enough not to be purged, yet is not 0 and therefore
        is not reported as new again.
        """
        t = (value, descr_link)
        self.curr.execute('select * from tpbg where url=? AND descr=?', t)
        row = self.curr.fetchone()
        if row is None:
            self.curr.execute('insert into tpbg values (?,?,0)', t)
            if self.printDebug:
                print("adding a new url.")
        else:
            self.curr.execute(
                'update tpbg set age = 1 where url=? AND descr=?', t)
            if self.printDebug:
                print("updating url age.")

    def getNew(self):
        """Print every row whose age is 0, i.e. added during this run."""
        count = 1
        self.curr.execute('select * from tpbg where age=0')
        for row in self.curr:
            print("Found new torrent #" + str(count) + ".")
            print("URL : " + str(row[0]))
            print("Description: " + SITEROOT + str(row[1]))
            print("")
            count += 1

    def incAge(self):
        """Increase the age of every stored row by one."""
        self.curr.execute('update tpbg set age = age + 1')

    def removeOld(self):
        """Delete rows whose age exceeds 100."""
        self.curr.execute('delete from tpbg where age > 100')

    def close(self):
        """Commit pending changes, then release cursor and connection."""
        self.conn.commit()
        self.curr.close()
        # The connection itself was previously left open.
        self.conn.close()


class MyParser(sgmllib.SGMLParser):
    """HTML parser that feeds wanted .torrent links into a ``urlList``."""

    def __init__(self, dbFile, verbose=0):
        """Initialise an object, passing 'verbose' to the superclass.

        ``dbFile`` is the path of the sqlite database backing the URL
        list; the table is created on first use.
        """
        self.ul = urlList(dbFile)
        self.printDebug = False
        self.lastLink = ""
        self.ul.initDb()
        sgmllib.SGMLParser.__init__(self, verbose)
        # Only want URLs which are torrent files.
        self.urlwantregex = re.compile(r"^http://.*\.torrent$")
        # This regex is used to remove torrent files in languages which
        # I am not interested in.
        self.langregex = re.compile(
            "italian|italia|spanish|hindi", re.IGNORECASE)

    def parse(self, s):
        """Parse the given string 's' as one complete HTML document."""
        # The same parser object is reused for several pages, so start
        # each one from a clean state; close() below has already flushed
        # whatever the previous page left buffered.
        self.reset()
        self.lastLink = ""
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        """Process a hyperlink and its 'attributes'."""
        for name, value in attributes:
            if name == "href":
                self.add_hyperlink(value)
                # Remember this href: if the next link is a torrent, this
                # one is stored as its description link.
                self.lastLink = value

    def add_hyperlink(self, value):
        """Store ``value`` if it is a wanted torrent URL.

        Links matching the language blacklist are ignored.  A torrent
        URL is stored together with ``self.lastLink``, the href seen
        immediately before it.
        """
        if self.langregex.search(value):
            return
        if self.urlwantregex.search(value):
            self.ul.add(value, self.lastLink)
            if self.printDebug:
                print("adding " + value)
                print("adding (2) " + self.lastLink)
            self.lastLink = ""

    def getNewLinks(self):
        """Print the torrents that were first seen during this run."""
        self.ul.getNew()

    def save(self):
        """Age all entries, purge stale ones and commit to disk."""
        self.ul.incAge()
        # removeOld() was never invoked anywhere, so the database grew
        # without bound; purge here, after ageing and before the commit.
        self.ul.removeOld()
        self.ul.close()


def main():
    """Fetch the listing pages, report new torrents and persist state."""
    # expanduser() honours $HOME when set and does not raise when it is
    # missing from the environment.
    dbFile = os.path.join(os.path.expanduser("~"), ".tpdbv2")
    myparser = MyParser(dbFile)
    # Pages 0-4 of the listing; per the file header this is presumably the
    # movie category sorted by leecher count -- confirm against the site.
    for i in range(5):
        url = SITEROOT + "/browse/201/" + str(i) + "/9"
        f = urllib.urlopen(url)
        try:
            s = f.read()
        finally:
            # Always release the HTTP connection.
            f.close()
        myparser.parse(s)
    # Report before save(): save() ages every row, after which nothing
    # has age 0 any more.
    myparser.getNewLinks()
    myparser.save()


if __name__ == "__main__":
    main()