Threads in Python only let you “stack” the network delays: the GIL rules out real parallelism, but an I/O-bound scraper like this one spends most of its time waiting on the network, and those waits can overlap. There's no interface; you simply edit the file to get what you want, quick and dirty. It hasn't been used in many years.
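As a quick illustration of the point (a minimal sketch, not part of the script below): five simulated one-second requests finish in roughly one second of wall-clock time instead of five, because the waits overlap. time.sleep stands in for a network call.

import threading
import time

def fake_request(i):
    time.sleep(1)  # stand-in for a one-second network delay

start = time.time()
threads = [threading.Thread(target=fake_request, args=(i,)) for i in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print "5 'requests' took %.1f seconds" % (time.time() - start)  # ~1.0, not ~5.0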
#!/usr/bin/env python
# Scans Craigslist city/category RSS pages and saves new data to SQLite
# (file:dbfile, table:cl)
# 2014/10/21
# 2014/11/01 - added concurrent threads, moved to project folder, added urllib_retry
# 2016-09-14 - updated, formatting, git tracking, getting ready to use in portfolio
"""Logs all posts in a given city and category to SQLite"""
import cookielib
from cookielib import CookieJar
from datetime import datetime
try: import Queue                          # Python 2
except ImportError: import queue as Queue  # Python 3
import random # to randomly stagger threads for better output
import re
import socket # for timeout/retry
import sqlite3
import string
import threading
import time
import urllib2
import urllib_retry # local module
# browser setup
cj = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent','Mozilla/5.0')] # fake header
# table setup
#connection = sqlite3.Connection("dbfile", detect_types=sqlite3.PARSE_COLNAMES)
#c = connection.cursor()
# if the cl table exists, drop it (makes new table each run for testing)
#c.execute("DROP TABLE IF EXISTS cl")
#c.execute("CREATE TABLE cl (Id INTEGER PRIMARY KEY NOT NULL,
# Scanned TEXT,
# Posted TEXT,
# Links TEXT,
# Titles TEXT,
# Bodies TEXT)") # make a new cl table
# enter the cities and categories to scan
cities = ['charleston','flagstaff','dallas','boston','miami','lasvegas']
#,'madison','pittsburgh','chicago','austin','neworleans']
#,'atlanta','newyork','seattle']
categories = ['rid','act']
q = Queue.Queue()
alive_threads = 0 # track how many threads are alive
lock = threading.Lock() # guards the shared thread counters
pages_scanned = 0
class WorkerThread(threading.Thread):
def __init__(self, city, category):
        global alive_threads
threading.Thread.__init__(self)
self.city = city
self.category = category
        with lock:
            alive_threads += 1  # increment the live-thread count under the lock
def run(self):
global alive_threads, pages_scanned
self.connection = sqlite3.Connection("bin/dbfile",
detect_types=sqlite3.PARSE_COLNAMES)
self.connection.text_factory = str
self.c = self.connection.cursor()
self.titles, self.bodies, self.posttimes = [], [], []
time.sleep(random.random())
print "Scanning:", self.city, self.category
scanned_links = self.link_scanner()
self.titles = self.page_scanner(scanned_links)
self.c.close()
        with lock:
            alive_threads -= 1  # decrement under the lock
            print "Done scanning %s. %d threads still active." % (self.city, alive_threads)
            pages_scanned += len(self.titles)  # tally pages scanned this run
return
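    # stacking the two retry decorators lets timeouts and URLErrors each be
    # retried independently before link_scanner is allowed to fail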
@urllib_retry.retry(urllib2.URLError, tries=2, delay=3)
@urllib_retry.retry(socket.timeout, tries=2, delay=3)
def link_scanner(self):
'''scan each starting_URL for links
'''
url = "http://"+self.city+".craigslist.org/search/"+self.category+"?format=rss"
html = opener.open(url, timeout=3).read() # open the starting page
scanned_links = re.findall("<link>(.*?)</link>", html) # find each post link
        for link in scanned_links[:]: # iterate a sliced copy so removing from the original is safe
self.c.execute("SELECT Id FROM cl WHERE Links = ?", (link,))
data = self.c.fetchone()
if data is not None: # if there is data... duplicate url found,
scanned_links.remove(link) # remove it from the unsliced original
return scanned_links
@urllib_retry.retry(urllib2.URLError, tries=2, delay=3)
@urllib_retry.retry(socket.timeout, tries=2, delay=3)
def page_scanner(self, scanned_links):
"""Scan each link for title, body, etc
"""
        if len(scanned_links) > 1: # the feed's own <link> is always in the list
            print ("%d new posts in %s's %s." % (len(scanned_links)-1,
                                                 self.city,
                                                 self.category))
            for scanned_link in scanned_links[:]: # iterate a copy; items are removed from the original below
                if scanned_link.endswith("rss"): # skip scanning the rss feed itself
                    pass
                else:
html = opener.open(scanned_link, timeout=3).read()
print "-"*80,"\nScanning:", scanned_link
# pulls the title from each scanned_URL
self.titles.append(re.findall(r'<title>(.*?)</title>',html))
self.bodies.append(re.findall(r'<section id="postingbody">(.*?)</section>',
html ,re.DOTALL)) # DOTALL . matches \n
self.posttimes.append(re.findall(r'Posted:.*"(.*?)T(.*?)-.*?</time>',
html ,re.DOTALL))
                    # strip the "show contact info" link, the <br>s,
                    # and the surrounding whitespace
                    target = ('<a href=".*" class="showcontact" '
                              'title="click to show contact info" '
                              'rel="nofollow">show contact info</a>')
                    # re.findall returns a list of matches, so bodies is a
                    # list of lists; bodies[-1][0] is the first match from
                    # the page just scanned
                    if self.bodies[-1]: # guard: some pages lack a postingbody section
                        source = str(self.bodies[-1][0].replace('<br>', '').strip())
                        self.bodies[-1][0] = re.sub(target, '', source)
self.date = datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S")
# print "SCAN:\t",self.date,
# "\nPOST:\t", self.posttimes[-1][:],
# "\nTITLE:\t",self.titles[-1][0],
# "\nBODY:\t",self.bodies[-1][0],
# "\n"
# insert data into db, cleanup bodies
# self.c.execute("INSERT INTO cl (Scanned, Posted, Links, Titles, Bodies)\
# VALUES (?, ?, ?, ?, ?)",
# (self.date,
# str(self.posttimes[-1]),
# scanned_link,
# str(self.titles[-1][0]),
# str(self.bodies[-1][0].replace('<br>', '').strip())))
# self.connection.commit() # move outside loop w/ good internet
q.put(self.titles)
                    print scanned_link, "queued, not written." # NOTE: uncomment the SQL above to log to SQLite
                    # write and remove entries one at a time to survive a flaky connection
                    scanned_links.remove(scanned_link)
else:
print ("\n0 new files in %s's %s." % (self.city, self.category))
return self.titles
if __name__ == '__main__':
print "%d cities to scan" % len(cities)
for city in cities:
for category in categories:
t = worker_thread(city, category)
t.start()
time.sleep(.1) # need a pause to increment the first alive_threads
while alive_threads > 0:
pass
while not q.empty():
d = q.get()
print d
print "\n","-"*88
print ("Scan of %d items completed at: %s" % (pages_scanned,
datetime.strftime(datetime.now(),
"%Y-%m-%d %H:%M:%S")))
#!/usr/bin/env python
# urllib_retry.py -- the local retry module imported by the scanner above
# http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
import time
from functools import wraps
def retry(ExceptionToCheck, tries=4, delay=3, backoff=2, logger=None):
"""Retry calling the decorated function using an exponential backoff.
http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry
:param ExceptionToCheck: the exception to check. may be a tuple of
exceptions to check
:type ExceptionToCheck: Exception or tuple
:param tries: number of times to try (not retry) before giving up
:type tries: int
:param delay: initial delay between retries in seconds
:type delay: int
:param backoff: backoff multiplier e.g. value of 2 will double the delay
each retry
:type backoff: int
:param logger: logger to use. If None, print
:type logger: logging.Logger instance
"""
def deco_retry(f):
@wraps(f)
def f_retry(*args, **kwargs):
mtries, mdelay = tries, delay
while mtries > 1:
try:
return f(*args, **kwargs)
                except ExceptionToCheck as e:
msg = "%s, Retrying in %d seconds..." % (str(e), mdelay)
if logger:
logger.warning(msg)
else:
print msg
time.sleep(mdelay)
mtries -= 1
mdelay *= backoff
return f(*args, **kwargs)
return f_retry # true decorator
return deco_retry
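A minimal usage sketch (the fetch helper and example URL are illustrative, not part of the module): decorate a flaky network call and transient URLErrors get retried with exponential backoff, while the final attempt is allowed to raise normally.

import urllib2
import urllib_retry

@urllib_retry.retry(urllib2.URLError, tries=3, delay=2, backoff=2)
def fetch(url):
    # tries=3 allows two failed attempts (retried after 2s, then 4s)
    # before the third and final attempt raises normally
    return urllib2.urlopen(url, timeout=3).read()

print len(fetch("http://example.com"))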