Craigslist Scanner
Tags:
Posted: 2014-10-21
Last Update: 2014-10-21

Threads in Python only let you "stack" (overlap) the network delays: the GIL rules out real CPU parallelism, but while one thread is waiting on a request the others can keep going.
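
To make that concrete, here is a minimal sketch (not part of the scanner) of what overlapping buys you: three simulated one-second "requests" run in threads finish in roughly one second of wall time instead of three.

# toy illustration of overlapping I/O waits with threads
import threading, time

def fake_fetch(city):
  time.sleep(1) # stands in for the network round trip
  print city, "done"

start = time.time()
threads = [threading.Thread(target=fake_fetch, args=(c,))
           for c in ['charleston', 'flagstaff', 'dallas']]
for t in threads: t.start()
for t in threads: t.join()
print "wall time: %.1fs" % (time.time() - start) # ~1s, not ~3s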

There is no interface; edit the file to get what you want. Quick and dirty.
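
For example, to watch a single city and category, the two lists near the top of clscan.py might become the following ('cto' is only an assumed example code; use whatever code shows up in the Craigslist search URL you care about):

cities = ['denver']
categories = ['cto'] # assumed example: cars & trucks by owner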

Hasn't been used in over a year.

clscan.py
#!/usr/bin/env python
# Scans Craigslist city/category RSS pages and saves new data to SQLite 
#   (file:dbfile, table:cl)
# 2014/10/21
# 2014/11/01 - added concurrent threads, moved to project folder, added urllib_retry
# 2016-09-14 - updated, formatting, git tracking, getting ready to use in portfolio
"""Logs all posts in a given city and category to SQLite"""
import cookielib
from cookielib import CookieJar
from datetime import datetime
try: import Queue
except ImportError: import queue as Queue
import random # to randomly stagger threads for better output
import re
import socket # for timeout/retry
import sqlite3
import string
import thread
import threading
import time
import urllib2
import urllib_retry # local module
# browser setup
cj = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) 
opener.addheaders = [('User-agent','Mozilla/5.0')] # fake header
# table setup
#connection = sqlite3.Connection("dbfile", detect_types=sqlite3.PARSE_COLNAMES)
#c = connection.cursor()
# if the cl table exists, drop it (makes new table each run for testing)
#c.execute("DROP TABLE IF EXISTS cl")
#c.execute("CREATE TABLE cl (Id INTEGER PRIMARY KEY NOT NULL,
#                            Scanned TEXT,
#                            Posted TEXT,
#                            Links TEXT,
#                            Titles TEXT,
#                            Bodies TEXT)") # make a new cl table

# enter city(s) and category(s)
cities = ['charleston','flagstaff','dallas','boston','miami','lasvegas']
#,'madison','pittsburgh','chicago','austin','neworleans']
#,'atlanta','newyork','seattle']
categories = ['rid','act']

q = Queue.Queue()

alive_threads = 0 # track how many threads are alive
lock = thread.allocate_lock() # create a lock object
pages_scanned = 0

class worker_thread(threading.Thread):

  def __init__(self, city, category):
    global alive_threads, pages_scanned
    threading.Thread.__init__(self)
    self.city = city
    self.category = category

    lock.acquire()
    alive_threads+=1 # increment under lock
    lock.release()

  def run(self):
    global alive_threads, pages_scanned
    self.connection = sqlite3.Connection("bin/dbfile",
                                         detect_types=sqlite3.PARSE_COLNAMES)
    self.connection.text_factory = str
    self.c = self.connection.cursor()
    self.titles, self.bodies, self.posttimes = [], [], []

    time.sleep(random.random())
    print "Scanning:", self.city, self.category
    scanned_links = self.link_scanner()
    self.titles = self.page_scanner(scanned_links)
    self.c.close()

    lock.acquire()
    alive_threads -= 1 # decrement under lock
    print "Done scanning %s. %d threads still active." % (self.city, alive_threads)
    pages_scanned += len(self.titles) # count pages scanned by this thread
    lock.release()
    return

  @urllib_retry.retry(urllib2.URLError, tries=2, delay=3)
  @urllib_retry.retry(socket.timeout, tries=2, delay=3)
  def link_scanner(self):
    '''scan each starting_URL for links
    '''
    url = "http://"+self.city+".craigslist.org/search/"+self.category+"?format=rss"
    html = opener.open(url, timeout=3).read() # open the starting page
    scanned_links = re.findall("<link>(.*?)</link>", html) # find each post link
    for link in scanned_links[:]: #slice new copy, maintains original
      self.c.execute("SELECT Id FROM cl WHERE Links = ?", (link,))
      data = self.c.fetchone()
      if data is not None:         # if there is data... duplicate url found,
        scanned_links.remove(link) # remove it from the unsliced original
    return scanned_links

  @urllib_retry.retry(urllib2.URLError, tries=2, delay=3)
  @urllib_retry.retry(socket.timeout, tries=2, delay=3)
  def page_scanner(self, scanned_links):
    """Scan each link for title, body, etc
    """
    global q
    if len(scanned_links) > 1:
      print ("%d new files in %s's %s." % (len(scanned_links)-1,
                                           self.city,
                                           self.category))
      for scanned_link in scanned_links[:]: # iterate a copy; entries are removed below
        if scanned_link[-3:] == "rss": # skip scanning the rss feed itself
          pass
        else:
          html = opener.open(scanned_link, timeout=3).read()
          print "-"*80, "\nScanning:", scanned_link
          # pull the title, body and post time out of each scanned page
          self.titles.append(re.findall(r'<title>(.*?)</title>', html))
          self.bodies.append(re.findall(r'<section id="postingbody">(.*?)</section>',
                                        html, re.DOTALL)) # DOTALL: . matches \n
          self.posttimes.append(re.findall(r'Posted:.*"(.*?)T(.*?)-.*?</time>',
                                           html, re.DOTALL))
          # remove the "show contact info" link, drop the <br>s,
          # and strip the whitespace
          target = (r'<a href=".*" class="showcontact" '
                    r'title="click to show contact info" '
                    r'rel="nofollow">show contact info</a>')
          source = str(self.bodies[-1][0].replace('<br>', '').strip())
          # NOTE: re.findall returns a list, so bodies/titles are lists of
          # lists of strings; [-1][0] is the string from the latest page
          self.bodies[-1][0] = re.sub(target, '', source)
          self.date = datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S")

#          print "SCAN:\t", self.date, \
#                "\nPOST:\t", self.posttimes[-1][:], \
#                "\nTITLE:\t", self.titles[-1][0], \
#                "\nBODY:\t", self.bodies[-1][0], "\n"

#         insert data into db, cleanup bodies
#          self.c.execute("INSERT INTO cl (Scanned, Posted, Links, Titles, Bodies)"
#                         " VALUES (?, ?, ?, ?, ?)",
#                         (self.date,
#                          str(self.posttimes[-1]),
#                          scanned_link,
#                          str(self.titles[-1][0]),
#                          str(self.bodies[-1][0].replace('<br>', '').strip())))
#          self.connection.commit() # move outside the loop on a good connection

          q.put(self.titles)
          print scanned_link, "Queued, not written." # NOTE: uncomment the sql above to log
          # write and remove each entry to get around a crappy connection
          scanned_links.remove(scanned_link)
    else:
      print ("\n0 new files in %s's %s." % (self.city, self.category))
    return self.titles

if __name__ == '__main__':
  print "%d cities to scan" % len(cities)
  for city in cities:
    for category in categories:
      t = worker_thread(city, category) 
      t.start()
  time.sleep(.1) # need a pause to increment the first alive_threads
  while alive_threads > 0:
    time.sleep(.5) # sleep instead of spinning a core while the workers finish
  while not q.empty():
    d = q.get()
    print d
  print "\n","-"*88
  print ("Scan of %d items completed at: %s" % (pages_scanned,
                                               datetime.strftime(datetime.now(),
                                                          "%Y-%m-%d %H:%M:%S")))
urllib_retry.py
#!/usr/bin/env python
# http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
import time
from functools import wraps
def retry(ExceptionToCheck, tries=4, delay=3, backoff=2, logger=None):
    """Retry calling the decorated function using an exponential backoff.

    http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
    original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry

    :param ExceptionToCheck: the exception to check. may be a tuple of
        exceptions to check
    :type ExceptionToCheck: Exception or tuple
    :param tries: number of times to try (not retry) before giving up
    :type tries: int
    :param delay: initial delay between retries in seconds
    :type delay: int
    :param backoff: backoff multiplier e.g. value of 2 will double the delay
        each retry
    :type backoff: int
    :param logger: logger to use. If None, print
    :type logger: logging.Logger instance
    """
    def deco_retry(f):
        @wraps(f)
        def f_retry(*args, **kwargs):
            mtries, mdelay = tries, delay
            while mtries > 1:
                try:
                    return f(*args, **kwargs)
                except ExceptionToCheck, e:
                    msg = "%s, Retrying in %d seconds..." % (str(e), mdelay)
                    if logger:
                        logger.warning(msg)
                    else:
                        print msg
                    time.sleep(mdelay)
                    mtries -= 1
                    mdelay *= backoff
            return f(*args, **kwargs)
        return f_retry  # true decorator
    return deco_retry
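
Outside the scanner the decorator is used the same way; a quick sketch, where flaky_fetch is a made-up stand-in for any call that can raise urllib2.URLError:

# toy usage of the retry decorator; flaky_fetch is hypothetical, not part of the scanner
import urllib2
import urllib_retry

@urllib_retry.retry(urllib2.URLError, tries=3, delay=1, backoff=2)
def flaky_fetch(url):
  return urllib2.urlopen(url, timeout=3).read()

print len(flaky_fetch("http://www.craigslist.org/"))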

TODO:










