# Source code for pacer_lib.reader

import codecs
import cStringIO
import csv
import datetime
import json
import os
import requests
import re
import time
import html5lib
from bs4 import BeautifulSoup, Comment

# Implement MySQL support
# Handle stuff
# Allow for multiple docket_paths?
# Should also return metadata from the download header

class docket_parser():
    """
    Returns a docket_parser object that provides functions to quickly load
    .html PACER docket sheets from the specified docket_path, parse metadata
    (about both the download of the docket and the characteristics of the
    case), and convert the docket entries into a machine-readable format
    (CSV).

    This object is built on top of BeautifulSoup 4.

    **Keyword Arguments:**

    * ``docket_path``: specifies a relative path to the storage of dockets
      (i.e., input data); dockets should be in .html format
    * ``output_path``: specifies a relative path to the folder where output
      should be written. If this folder does not exist, it will be created.
      If the two subfolders (``/case_meta/`` and ``/download_meta/``) do not
      exist within the output_path, they will also be created.
    """
    def __init__(self, docket_path='./results/local_docket_archive',
                 output_path='./results'):
        self.docket_path = docket_path
        self.bugged_path = os.path.abspath(output_path + '/bugged_dockets/')
        self.output_path = os.path.abspath(output_path + '/parsed_dockets/')
        self.output_meta_path = os.path.abspath(output_path
                                                + '/parsed_dockets_meta/')

        # Check that the output path exists and create it (and its
        # subfolders) if it doesn't.
        if not os.path.exists(output_path):
            os.makedirs(self.output_path)
            os.makedirs(self.output_meta_path)
            # parse_dir() writes to these two subfolders, so create them too
            os.makedirs(self.output_meta_path + '/case_meta/')
            os.makedirs(self.output_meta_path + '/download_meta/')
            os.makedirs(self.bugged_path)
        elif (not os.path.exists(self.output_path)
              or not os.path.exists(self.output_meta_path)
              or not os.path.exists(self.bugged_path)):
            if not os.path.exists(self.output_path):
                os.makedirs(self.output_path)
            if not os.path.exists(self.output_meta_path):
                os.makedirs(self.output_meta_path)
                os.makedirs(self.output_meta_path + '/case_meta/')
                os.makedirs(self.output_meta_path + '/download_meta/')
            if not os.path.exists(self.bugged_path):
                os.makedirs(self.bugged_path)
    def parse_data(self, data):
        """
        Returns a list of all of the docket entries in ``data``, which
        should be a string literal.

        BeautifulSoup is used to parse a .html docket file (passed as a
        string literal through ``data``) into a list of docket entries.
        Each docket entry is also a list. This uses html.parser and, in the
        case of failure, switches to html5lib. If it cannot find the table
        or the entries, it returns a string as an error message.

        **Keyword Arguments**

        * ``data``: should be a string, read from a .html file.

        **Output Documentation**

        Each docket entry is a list whose elements are (in order):

        0. date_filed
        1. document_number
        2. docket_description
        3. link_exist (a dummy to indicate the existence of a link)
        4. document_link
        5. unique_id (document_number does not uniquely identify the docket
           entry, so we create a separate unique identifier based on the
           entry's placement in the .html docket sheet)
        """
        parsed_docket_table = []

        # Parse the .html docket into a list of entries. While most files
        # can be processed with html.parser, some cannot; in those cases,
        # we switch to the more lenient html5lib.

        # A. Make soup.
        source = BeautifulSoup(data)
        for s in source('script'):
            s.extract()                       # Remove script elements

        # B. Identify the table.
        docket_table = source.find('table', {'rules': 'all'})
        if not docket_table:
            # Re-parse with html5lib in case we can't find the table
            source = BeautifulSoup(data, "html5lib")
            for s in source('script'):
                s.extract()
            docket_table = source.find('table', {'rules': 'all'})
            if not docket_table:
                # If we still can't find the table, return an error.
                return "Error, could not find docket_table."

        # C. Find the entries.
        docket_entries = docket_table.find_all('tr')
        if not docket_entries:
            # Re-parse in case we found the table but not the rows
            source = BeautifulSoup(data, "html5lib")
            for s in source('script'):
                s.extract()
            docket_table = source.find('table', {'rules': 'all'})
            docket_entries = docket_table.find_all('tr')
            if not docket_entries:
                # If we still can't find entries, return an error.
                return "Error, could not find docket_entries."

        # D. Parse each entry into a list of characteristics and append it
        #    to parsed_docket_table.
        skip_first_line = 0
        for entry in docket_entries:
            # Skip the header row
            if skip_first_line == 0:
                skip_first_line = 1
                continue

            # Turn the docket entry into a list
            row_cells = entry.find_all('td')
            row_contents = [c.get_text(' ', strip=True) for c in row_cells]
            row_contents = [c.replace('\t', '').replace('\r\n', '')
                            for c in row_contents]

            # Truncate extremely long cells
            for n, content in enumerate(row_contents):
                if len(content) > 20000:
                    row_contents[n] = content[0:20001] + "(TRUNCATED)"

            # Replace missing information
            if row_contents[0] == '':
                row_contents[0] = 'NA'
            if row_contents[1] == '':
                row_contents[1] = 'NA'

            # Search for a link to a document
            link = row_cells[1].find('a')
            if link:
                link_exist = '1'
                link = link.get('href')
            else:
                link_exist = '0'
                link = ''
            row_contents.extend([link_exist, link])
            parsed_docket_table.append(row_contents)

        # Append a unique id based on the entry's position in the sheet
        for number, line in enumerate(parsed_docket_table):
            parsed_docket_table[number].append(str(number))

        return parsed_docket_table
    def extract_download_meta(self, data):
        """
        Returns a dictionary that contains all of the download meta that
        was stored by ``pacer_lib.scraper()`` at the time of download
        (i.e., the *detailed_info* JSON object that is commented out at
        the top of new downloads from PACER). This is meant to help
        improve reproducibility.

        *detailed_info* is an add-on in later versions of pacer_lib that
        records case-level data from the search screen (date_closed, link,
        nature of suit, case name, etc.) as well as the date and time of
        download. In earlier versions of pacer_lib (i.e., released as
        pacer_scraper_library), this was stored as a list and did not
        include the date and time of download. ``extract_download_meta()``
        can also handle these *detailed_info* objects.

        If there is no *detailed_info*, the function returns a dictionary
        with the key 'Error_download_meta'.

        **Keyword Arguments**

        * ``data``: should be a string, read from a .html file.

        **Output Documentation**

        Unless otherwise noted, all of these are collected from the PACER
        Case Locator results page. This is documented as ``key``:
        description of value.

        These terms are found in documents downloaded by any version of
        pacer_lib:

        * ``searched_case_no``: the case number that was passed to
          pacer_lib.scraper(); this is recorded to ensure reproducibility
          and comes from pacer_lib. It is not found on the PACER Case
          Locator results page.
        * ``court_id``: the abbreviation for the court the case was
          located in
        * ``case_name``: the name of the case, as recorded by PACER
        * ``nos``: a code for "Nature of Suit"
        * ``date_filed``: the date the case was filed, as recorded by PACER
        * ``date_closed``: the date the case was closed, as recorded by
          PACER
        * ``link``: a link to the docket

        These are only in documents downloaded with newer versions of
        pacer_lib:

        * ``downloaded``: string that describes the time the docket was
          downloaded by pacer_lib. This is not found on the PACER Case
          Locator results page. (Format: yyyy-mm-dd,hh:mm:ss)
        * ``listed_case_no``: string that describes the preferred PACER
          case number for this case (as opposed to the query we submitted)
        * ``result_no``: which result the case was on the PACER Case
          Locator results page
        """
        source = BeautifulSoup(data)
        r = source.find(text=lambda text: isinstance(text, Comment))
        if r:
            # New detailed_info (stored as a dictionary literal)
            if "detailed_info:" in r and '{' in r:
                r = r.replace('detailed_info:\n', '').replace("\"\"", "\"")
                # Fix internal quotation marks
                r = r.replace(" \"", " \\\"").replace("\" ", "\\\" ")
                r = r.replace(" \'", " \\\'").replace("\' ", "\\\' ").replace("\'", "\\\'")
                r = r.replace("\\\',\\\'", " \',\'").replace("\\\':\\\'", " \':\'")
                r = r.replace("{\\\'", "{\'").replace("\\\'}", "\'}")
                detailed_info = eval(r)
            # Legacy detailed_info (stored as a list)
            elif "detailed_info:" in r and '(' in r:
                r = r.replace('detailed_info:', '')
                r = r.replace('(', '[').replace(')', ']').replace("\"\"", "\"").strip()
                # Fix internal quotation marks
                r = r.replace(" \"", " \\\"").replace("\" ", "\\\" ")
                r = r.replace(" \'", " \\\'").replace("\' ", "\\\' ").replace("\'", "\\\'")
                r = r.replace("\\\',\\\'", " \',\'").replace("\\\':\\\'", " \':\'")
                r = r.replace("{\\\'", "{\'").replace("\\\'}", "\'}")
                temp = eval(r)
                detailed_info = {'searched_case_no': temp[0],
                                 'court_id': temp[1],
                                 'case_name': temp[2],
                                 'nos': temp[3],
                                 'date_filed': temp[4],
                                 'date_closed': temp[5],
                                 'link': temp[6]}
            else:
                return {'Error_download_meta': '\'detailed_info:\' not found'}
        else:
            return {'Error_download_meta': 'no comments in source code'}
        return detailed_info
    def extract_lawyer_meta(self, data):
        """
        Returns a dictionary of information about the plaintiff, the
        defendant and their lawyers, extracted from an .html docket
        (passed as a string literal through ``data``).

        At the moment, ``extract_lawyer_meta()`` only handles the most
        common listing (i.e., one listing for the plaintiff and one
        listing for the defendant). If there is more than one set of
        plaintiffs or defendants (e.g., in a class action suit), the
        function returns a dictionary with a single key,
        *Error_lawyer_meta*. This function does not handle movants and
        will probably not handle class-action cases.

        In dockets downloaded from older versions of pacer_lib (e.g.,
        pacer_scraper_library), lawyer information was not requested, so
        those dockets do not contain any lawyer_meta to be extracted.

        **Output Documentation**

        This is documented as ``key``: description of value.

        * ``plaintiffs``: list of the names of plaintiffs
        * ``defendants``: list of the names of defendants
        * ``plaintiffs_attorneys``: list of the names of attorneys
          representing the plaintiffs
        * ``defendants_attorneys``: list of the names of attorneys
          representing the defendants
        * ``plaintiffs_attorneys_details``: string that contains the
          cleaned output of all plaintiff lawyer data (e.g., firm,
          address, email, etc.) that can be further cleaned in the future
        * ``defendants_attorneys_details``: string that contains the
          cleaned output of all defendant lawyer data (e.g., firm,
          address, email, etc.) that can be further cleaned in the future
        """
        source = BeautifulSoup(data)
        table_id = {'border': '0', 'cellspacing': '5', 'width': '100%'}
        tables = source.find_all('table', table_id)

        # Identify the base table that holds the lawyer meta
        base = ''
        for table in tables:
            filter_text = table.get_text().lower()
            # Skip the top meta info
            if ("jury demand" in filter_text
                    or "date filed" in filter_text
                    or "docket text" in filter_text):
                continue
            if ("plaintiff" in filter_text and "defendant" in filter_text
                    and "represented" in filter_text):
                base = table
        if not base:
            return {'Error_lawyer_meta': 'Could not identify table'}

        rows = base.find_all('tr')

        # Keep track of the parsing
        plaintiff_row = ''
        defendant_row = ''
        parse_state = 0
        for row in rows:
            # Change the state as we pass different header rows. Check for
            # "Fictitious Defendant" before "Defendant" because the latter
            # is a substring of the former.
            if "Plaintiff" in row.get_text():
                parse_state = 1
                continue
            if "Fictitious Defendant" in row.get_text():
                parse_state = 3
            elif "Defendant" in row.get_text():
                parse_state = 2
                continue
            # Skip empty rows
            if not row.get_text().strip():
                continue
            # Assign the next non-empty row to plaintiff and defendant;
            # the order should not matter
            if parse_state == 1 and not plaintiff_row:
                plaintiff_row = row
            if parse_state == 2 and not defendant_row:
                defendant_row = row

        # Return an error if we don't find any applicable rows.
        if parse_state == 0:
            return {'Error_lawyer_meta': 'Could not identify any rows.'}

        plaintiff_cells = plaintiff_row.find_all('td', {'width': '40%'})
        defendant_cells = defendant_row.find_all('td', {'width': '40%'})
        if len(defendant_cells) != 2 or len(plaintiff_cells) != 2:
            return {'Error_lawyer_meta':
                    'Too many cells or not enough cells. Check source.'}

        # Clean the plaintiffs' names
        plaintiffs = plaintiff_cells[0].find_all('b')
        for n, name in enumerate(plaintiffs):
            new_name = name.get_text().strip()
            new_name = new_name.replace('\t', ' ')
            while '  ' in new_name:
                new_name = new_name.replace('  ', ' ')
            plaintiffs[n] = new_name

        plaintiffs_attorneys = plaintiff_cells[1].find_all('b')
        for n, name in enumerate(plaintiffs_attorneys):
            new_name = name.get_text().strip()
            new_name = new_name.replace('\t', ' ')
            while '  ' in new_name:
                new_name = new_name.replace('  ', ' ')
            plaintiffs_attorneys[n] = new_name

        plaintiffs_attorneys_details = plaintiff_cells[1].get_text().strip()
        plaintiffs_attorneys_details = plaintiffs_attorneys_details.replace('\r', '')
        while '  ' in plaintiffs_attorneys_details:
            plaintiffs_attorneys_details = plaintiffs_attorneys_details.replace('  ', ' ')
        while '\n ' in plaintiffs_attorneys_details:
            plaintiffs_attorneys_details = plaintiffs_attorneys_details.replace('\n ', '\n')
        while '\n\n' in plaintiffs_attorneys_details:
            plaintiffs_attorneys_details = plaintiffs_attorneys_details.replace('\n\n', '\n')

        # Clean the defendants' names
        defendants = defendant_cells[0].find_all('b')
        for n, name in enumerate(defendants):
            new_name = name.get_text().strip()
            new_name = new_name.replace('\t', ' ')
            while '  ' in new_name:
                new_name = new_name.replace('  ', ' ')
            defendants[n] = new_name

        defendants_attorneys = defendant_cells[1].find_all('b')
        for n, name in enumerate(defendants_attorneys):
            new_name = name.get_text().strip()
            new_name = new_name.replace('\t', ' ')
            while '  ' in new_name:
                new_name = new_name.replace('  ', ' ')
            defendants_attorneys[n] = new_name

        defendants_attorneys_details = defendant_cells[1].get_text().strip()
        defendants_attorneys_details = defendants_attorneys_details.replace('\r', '')
        while '  ' in defendants_attorneys_details:
            defendants_attorneys_details = defendants_attorneys_details.replace('  ', ' ')
        while '\n ' in defendants_attorneys_details:
            defendants_attorneys_details = defendants_attorneys_details.replace('\n ', '\n')
        while '\n\n' in defendants_attorneys_details:
            defendants_attorneys_details = defendants_attorneys_details.replace('\n\n', '\n')

        lawyer_meta_dict = {'plaintiffs': plaintiffs,
                            'plaintiffs_attorneys': plaintiffs_attorneys,
                            'plaintiffs_attorneys_details': plaintiffs_attorneys_details,
                            'defendants': defendants,
                            'defendants_attorneys': defendants_attorneys,
                            'defendants_attorneys_details': defendants_attorneys_details}
        return lawyer_meta_dict
    def extract_case_meta(self, data):
        """
        Returns a dictionary of case information (e.g., case_name, demand,
        nature of suit, jurisdiction, assigned judge, etc.) extracted from
        an .html docket (passed as a string literal through ``data``).
        This information should be available in all dockets downloaded
        from PACER.

        This information may overlap with information from
        ``extract_download_meta()``, but it is technically extracted from
        a different source (the docket sheet, rather than the results page
        of the PACER Case Locator).

        In consolidated cases, there is information about the lead case
        and a link. We extract any links in the case_meta section of the
        document and store them in the dictionary with the key
        *meta_links*.

        There are some encoding issues with characters such as '\\xc3'
        that we have tried to address, but this may need to be improved in
        the future.

        If ``extract_case_meta()`` cannot find the case_meta section of
        the docket, it returns a dictionary with a single key,
        *Error_case_meta*.

        **Output Documentation**

        Please note that ``extract_case_meta`` does common cleaning and
        then treats each (text):(text) line as a key:value pair, so this
        documentation only lists the most common keys that we have
        observed. These keys are generally self-explanatory and are only
        listed for convenience.

        * ``Case name``
        * ``Assigned to``
        * ``Referred to``
        * ``Demand``
        * ``Case in other court``
        * ``Cause``
        * ``Date Filed``
        * ``Date Terminated``
        * ``Jury Demand``
        * ``Nature of Suit``
        * ``Jurisdiction``

        Special keys:

        * ``Member case``: the existence of this key indicates that this
          is probably the lead case of a consolidated case.
        * ``Lead case``: the existence of this key indicates that this is
          probably a member case of a consolidated case.
        * ``meta_links``: this will only exist if there are links in the
          case_meta section of the PACER docket.
        """
        source = BeautifulSoup(data)
        case_meta = ''
        case_meta_dict = {}
        meta_links = []

        # Find the correct cells (split into two columns)
        left_column = source.find_all('td', {'valign': 'top', 'width': '60%'})
        for cell in left_column:
            if "Assigned to" in cell.prettify():
                case_meta += cell.text.strip()
                # Extract left-column links
                links = cell.find_all('a')
                for link in links:
                    meta_links.append((link.text, link['href']))

        right_column = source.find_all('td', {'valign': 'top', 'width': '40%'})
        for cell in right_column:
            if "Date Filed:" in cell.prettify():
                case_meta += cell.text.strip()

        if case_meta == '':
            return {'Error_case_meta': 'case_meta string not found in columns'}

        # 1. CLEAN the case_meta string
        # Strip leading and trailing whitespace
        case_meta = case_meta.strip()
        # Remove carriage returns
        case_meta = case_meta.replace('\r', '')
        # Remove double spaces
        while '  ' in case_meta:
            case_meta = case_meta.replace('  ', ' ')
        # Remove leading spaces
        while '\n ' in case_meta:
            case_meta = case_meta.replace('\n ', '\n')
        # Remove double line breaks
        while '\n\n' in case_meta:
            case_meta = case_meta.replace('\n\n', '\n')
        # Remove problem strings
        case_meta = case_meta.replace(':\n', ': ')

        # 2. PARSE the case_meta string into a case_meta list and finally
        #    into a case_meta dictionary
        case_meta = case_meta.split('\n')
        for n, item in enumerate(case_meta):
            if u'\xa0' in case_meta[n]:
                case_meta[n] = case_meta[n].replace(u'\xa0', u' ')
            if u'\xc3' in case_meta[n]:
                case_meta[n] = case_meta[n].replace(u'\xc3', '')
            case_meta[n] = case_meta[n].split(':')
            for n2, subitem in enumerate(case_meta[n]):
                case_meta[n][n2] = subitem.strip()
            if len(case_meta[n]) == 1:
                case_meta_dict['Case name'] = case_meta[n][0]
            elif len(case_meta[n]) > 2:
                case_meta_dict[case_meta[n][0]] = ':'.join(case_meta[n][1:len(case_meta[n])])
            else:
                case_meta_dict[case_meta[n][0]] = case_meta[n][1]

        # 3. Add all of the links
        if meta_links:
            case_meta_dict['meta_links'] = meta_links

        return case_meta_dict
    def extract_all_meta(self, data, debug=False):
        """
        Returns two dictionaries, one that has download_meta and one that
        contains meta extracted from the docket.

        ``extract_all_meta()`` runs ``extract_case_meta()``,
        ``extract_lawyer_meta()`` and ``extract_download_meta()`` on
        ``data`` (a string literal of an .html document). It returns two
        dictionaries (one containing download_meta and one containing both
        case_meta and lawyer_meta) because download_meta and case_meta
        have overlapping information.

        If debug is not turned on, extract_all_meta will ignore any error
        output from the sub-functions (e.g., if the functions cannot find
        the relevant sections).

        **Output Documentation**

        See the output documentation of ``extract_case_meta()``,
        ``extract_lawyer_meta()`` and ``extract_download_meta()``.
        """
        # Check for errors.
        download_meta = self.extract_download_meta(data)
        if "Error_download_meta" in download_meta and not debug:
            download_meta = {}

        case_meta = self.extract_case_meta(data)
        if "Error_case_meta" in case_meta and not debug:
            case_meta = {}

        lawyer_meta = self.extract_lawyer_meta(data)
        if "Error_lawyer_meta" in lawyer_meta and not debug:
            lawyer_meta = {}

        # Check for duplicate keys
        l_c_keys = [key for key in lawyer_meta if key in case_meta]
        if l_c_keys:
            return download_meta, {'Error': 'Key Conflicts'}
        else:
            docket_meta = dict(case_meta.items() + lawyer_meta.items())
            return download_meta, docket_meta
    def parse_dir(self, overwrite=True, get_meta=True):
        """
        Runs ``parse_data()`` and ``extract_all_meta()`` on each file in
        the docket_path folder and writes the output to the output_path.

        **Output Documentation**

        This function returns nothing.

        **File documentation**

        The docket entries of each docket are stored as a .csv in the
        folder 'parsed_dockets'. The filename of the .csv indicates the
        source docket and the columns represent (in order):

        0. date_filed
        1. document_number
        2. docket_description
        3. link_exist (a dummy to indicate the existence of a link)
        4. document_link
        5. unique_id (document_number does not uniquely identify the
           docket entry, so we create a separate unique identifier based
           on the entry's placement in the .html docket sheet)

        The download meta and the case and lawyer meta of each docket are
        stored as JSON objects in the sub-folders
        'parsed_dockets_meta/download_meta/' and
        'parsed_dockets_meta/case_meta/' within the output path. The
        filenames indicate the source docket and are prefixed by
        **download_meta_** and **case_meta_**, respectively.
        """
        csv_headers = ['date_filed', 'document_number', 'docket_description',
                       'link_exist', 'document_link', 'unique_id']

        # Check for all of the files that have been downloaded
        for root, dirs, files in os.walk(self.docket_path):
            for file in files:
                output_filename = (self.output_path + '/'
                                   + file.replace('html', 'csv'))
                if get_meta:
                    case_meta_filename = (self.output_meta_path
                                          + '/case_meta/case_meta_'
                                          + file.replace('html', 'json'))
                    download_meta_filename = (self.output_meta_path
                                              + '/download_meta/download_meta_'
                                              + file.replace('html', 'json'))

                # If the file exists and we have been told not to
                # overwrite, skip it.
                if overwrite or not os.path.exists(output_filename):
                    with open(self.docket_path + '/' + file, 'r') as input_file:
                        source = input_file.read()

                    if get_meta:
                        download_meta, case_meta = self.extract_all_meta(source)

                    content = self.parse_data(source)

                    # Error handling; copy the docket out and continue.
                    if (content == "Error, could not find docket_table."
                            or content == "Error, could not find docket_entries."):
                        print file, content
                        with open(self.bugged_path + '/' + file, 'w') as bugged:
                            bugged.write(source)
                        continue

                    # Add the number of docket entries
                    if get_meta:
                        case_meta['docket_entries'] = len(content)

                    with codecs.open(output_filename, 'w') as output:
                        writer = UnicodeWriter(output, dialect='excel')
                        writer.writerow(csv_headers)
                        writer.writerows(content)

                    if get_meta:
                        with codecs.open(download_meta_filename, 'w') as output:
                            json.dump(download_meta, output)
                        with codecs.open(case_meta_filename, 'w') as output:
                            json.dump(case_meta, output)
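# A minimal usage sketch for docket_parser (illustrative, not part of the
# original module): it assumes .html dockets downloaded by
# pacer_lib.scraper() are stored in the default
# './results/local_docket_archive' folder. The helper name is hypothetical.
def _example_parse_all_dockets():
    parser = docket_parser(docket_path='./results/local_docket_archive',
                           output_path='./results')
    # Writes one .csv of docket entries per docket, plus case_meta and
    # download_meta .json files, into './results'.
    parser.parse_dir(overwrite=True, get_meta=True)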
class docket_processor():
    """
    Returns a ``docket_processor()`` object that allows for keyword and
    boolean searching of docket entries from the dockets specified in
    *processed_path*.

    ``docket_processor`` relies on the use of ``docket_parser`` to parse
    .html PACER dockets into structured .csv files, although it is
    theoretically possible (but quite tedious) to independently bring
    dockets into compliance for use with ``docket_processor``.

    This will give you a set of documents (and their associated links) for
    download (which can be passed to pacer_lib.scraper()). The object then
    outputs a docket-level or consolidated .csv that describes all
    documents that meet the search criteria (stored in *hit_list*).

    **Keyword Arguments**

    * ``processed_path`` points to the folder containing .csv docket files
    * ``output_path`` points to the folder where you would like output to
      be stored. Note that the output will actually be stored in a
      subfolder of the *output_path* called */docket_hits/*.

    If the folders do not exist, they will be created.
    """
    def __init__(self, processed_path='./results/parsed_dockets',
                 output_path='./results/'):
        self.processed_path = processed_path

        # Check that the output path exists
        if not os.path.exists(output_path + '/docket_hits'):
            os.makedirs(os.path.abspath(output_path) + "/docket_hits")
        self.output_path = os.path.abspath(output_path + '/docket_hits')

        # Initialize the dictionary of docket:match listings
        self.hit_list = {}
    def search_text(self, text, require_term=[], exclude_term=[],
                    case_sensitive=False):
        """
        Returns a boolean indicating whether all criteria are satisfied in
        *text*. The criteria are determined in this way:

        * all strings in the list *require_term* are found in *text*
        * and, no strings in the list *exclude_term* are found in *text*

        If you pass a string instead of a list to either *require_term* or
        *exclude_term*, ``search_text()`` will convert it to a list.

        This search is, by default, case-insensitive, but you can turn on
        case-sensitive search through *case_sensitive*.
        """
        # If there are neither required nor excluded terms, the function
        # has been called incorrectly, so raise an error.
        if not require_term and not exclude_term:
            raise ValueError('You must search for at least one required '
                             'or excluded term.')

        # If search terms are single strings, convert them to lists.
        if isinstance(require_term, str):
            require_term = [require_term]
        if isinstance(exclude_term, str):
            exclude_term = [exclude_term]

        # Convert to case-insensitive
        if not case_sensitive:
            text = text.lower()
            for n, term in enumerate(require_term):
                require_term[n] = term.lower()
            for n, term in enumerate(exclude_term):
                exclude_term[n] = term.lower()

        # SEARCH THE TEXT
        term_match = True
        # Check that all required terms are in the text
        for term in require_term:
            if term not in text:
                term_match = False
        # Check that no excluded terms are in the text
        for term in exclude_term:
            if term in text:
                term_match = False
        return term_match
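    # For example (an illustrative call, assuming ``processor`` is a
    # docket_processor instance):
    #
    #   processor.search_text("ORDER granting motion to dismiss",
    #                         require_term=['motion to dismiss'],
    #                         exclude_term=['denied'])
    #   # -> True: the required term is present and the excluded term is not.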
    def search_docket(self, docket, require_term=[], exclude_term=[],
                      case_sensitive=False, within=0):
        """
        Returns a list of docket entries that match the search criteria.

        Docket entries are lists that have the same structure as described
        in docket_parser, i.e., in order:

        0. date_filed
        1. document_number
        2. docket_description
        3. link_exist (a dummy to indicate the existence of a link)
        4. document_link
        5. unique_id (document_number does not uniquely identify the
           docket entry, so we create a separate unique identifier based
           on the entry's placement in the .html docket sheet)

        The docket is specified by the argument *docket* and searched for
        in the *self.processed_path* folder.

        The search criteria are specified by *require_term*,
        *exclude_term*, *case_sensitive* and *within*, such that:

        * if *within* != 0, all searches are constrained to the first x
          characters of the text, where x = *within*
        * all strings in the list *require_term* are found in *text* (or
          the first x characters, if *within* is used)
        * and, no strings in the list *exclude_term* are found in *text*
          (or the first x characters, if *within* is used)
        * if *case_sensitive* = True, then the search is case-sensitive
        """
        # Returns a list of entries (a list of lists) that match the
        # search terms
        matched_list = []
        header_passed = False
        with open(self.processed_path + '/' + docket, 'r') as search_csv:
            docket_reader = csv.reader(search_csv, dialect='excel')
            for row in docket_reader:
                # Skip the column headers (either the 6-column headers
                # written by parse_dir() or the older 7-column variant)
                if not header_passed:
                    if row in (['date_filed', 'document_number',
                                'docket_description', 'link_exist',
                                'document_link', 'unique_id'],
                               ['docket_number', 'date_filed',
                                'document_number', 'docket_description',
                                'link_exist', 'document_link',
                                'unique_id']):
                        header_passed = True
                        continue
                if within == 0:
                    if self.search_text(row[2], require_term, exclude_term,
                                        case_sensitive):
                        matched_list.append(row)
                else:
                    # Constrain the search to the first `within` characters
                    # of the docket description
                    if self.search_text(row[2][:within], require_term,
                                        exclude_term, case_sensitive):
                        matched_list.append(row)
        return matched_list
    def search_dir(self, require_term=[], exclude_term=[],
                   case_sensitive=False, within=0):
        """
        Runs ``search_docket()`` on each docket in *self.processed_path*
        and adds hits to *self.hit_list* as a key-value pair
        *case_number*: *[docket entries]*, where *case_number* is taken
        from the filename and *[docket entries]* is a list of docket
        entries (which are also lists) that meet the search criteria.

        The search criteria are specified by *require_term*,
        *exclude_term*, *case_sensitive* and *within*, such that:

        * if *within* != 0, all searches are constrained to the first x
          characters of the text, where x = *within*
        * all strings in the list *require_term* are found in *text* (or
          the first x characters, if *within* is used)
        * and, no strings in the list *exclude_term* are found in *text*
          (or the first x characters, if *within* is used)
        * if *case_sensitive* = True, then the search is case-sensitive

        Returns nothing.
        """
        # Search the directory for all dockets with matching documents
        for root, dirs, files in os.walk(self.processed_path):
            for file in files:
                filename = file.replace('.csv', '')
                # Add dockets with matches to the hit_list dictionary
                if filename not in self.hit_list:
                    matched_list = self.search_docket(file, require_term,
                                                      exclude_term,
                                                      case_sensitive, within)
                    if matched_list:
                        self.hit_list[filename] = matched_list
                else:
                    for match in self.search_docket(file, require_term,
                                                    exclude_term,
                                                    case_sensitive, within):
                        if match not in self.hit_list[filename]:
                            self.hit_list[filename].append(match)
    def write_all_matches(self, suffix, overwrite_flag=False):
        """
        Writes all of the matches found in the *self.hit_list* dictionary
        to a single .csv file (**all_match__[suffix].csv**) in
        *self.output_path*.

        The columns of the .csv are (in order):

        0. docket_number (the case number, taken from the source .csv
           filename)
        1. date_filed
        2. document_number
        3. docket_description
        4. link_exist (a dummy to indicate the existence of a link)
        5. document_link
        6. unique_id (document_number does not uniquely identify the
           docket entry, so we create a separate unique identifier based
           on the entry's placement in the .html docket sheet)

        There is a flag for overwriting. You cannot use
        ``/ \ ? % * : | " < > . _`` or spaces in the suffix.

        Returns nothing.
        """
        csv_headers = ['docket_number', 'date_filed', 'document_number',
                       'docket_description', 'link_exist', 'document_link',
                       'unique_id']

        # Strip characters that are not allowed in the suffix
        suffix = suffix.replace('_', '').replace('/', '').replace('\\', '')
        suffix = suffix.replace('?', '').replace('%', '')
        suffix = suffix.replace('*', '').replace(':', '').replace('|', '')
        suffix = suffix.replace('\"', '').replace('<', '').replace('>', '')
        suffix = suffix.replace('.', '').replace(' ', '')

        if not overwrite_flag:
            if os.path.exists(self.output_path + '/all_match__' + suffix
                              + '.csv'):
                raise IOError('A .csv with the suffix "' + suffix
                              + '" already exists. Choose a new suffix or '
                              + 'specify overwrite_flag.')

        with open(self.output_path + '/all_match__' + suffix + '.csv',
                  'w') as f:
            writer = csv.writer(f, dialect='excel')
            writer.writerow(csv_headers)
            for key in self.hit_list.keys():
                for row in self.hit_list[key]:
                    # Prepend the docket number without mutating the rows
                    # stored in hit_list
                    writer.writerow([key] + row)
    def write_individual_matches(self, suffix, overwrite_flag=False):
        """
        Writes the matches in the *self.hit_list* dictionary to one .csv
        file per docket sheet (determined by the source .csv) in a folder
        named after the suffix. To distinguish them from the source .csv
        files, they are prefixed by a ^. They are also suffixed to allow
        for multiple searches of the same source .csv.

        The suffix is required. If the same suffix is specified and the
        overwrite flag is turned on, previous searches will be overwritten
        (all of the old files in the suffix folder are deleted).

        You cannot use ``/ \ ? % * : | " < > . _`` or spaces in the
        suffix.

        Returns nothing.
        """
        csv_headers = ['case_number', 'date_filed', 'document_number',
                       'docket_description', 'link_exist', 'document_link',
                       'unique_id']

        # Strip characters that are not allowed in the suffix
        suffix = suffix.replace('_', '').replace('/', '').replace('\\', '')
        suffix = suffix.replace('?', '').replace('%', '')
        suffix = suffix.replace('*', '').replace(':', '').replace('|', '')
        suffix = suffix.replace('\"', '').replace('<', '').replace('>', '')
        suffix = suffix.replace('.', '').replace(' ', '')

        # Check if the directory exists. If not, create it. If it does,
        # check if we need to overwrite it. If we do need to overwrite,
        # delete everything in the folder.
        result_path = self.output_path + '/' + suffix
        if not os.path.exists(result_path):
            os.makedirs(result_path)
        else:
            if overwrite_flag:
                for f in os.listdir(result_path):
                    os.remove(result_path + '/' + f)
            else:
                raise IOError('.csv files with the suffix "' + suffix
                              + '" already exist. Choose a new suffix or '
                              + 'specify overwrite_flag.')

        # If everything is working, write the matches.
        for key in self.hit_list.keys():
            with open(result_path + '/' + '^' + key + '_' + suffix + '.csv',
                      'w') as f:
                writer = csv.writer(f, dialect='excel')
                writer.writerow(csv_headers)
                writer.writerows(self.hit_list[key])
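# A minimal usage sketch for docket_processor (illustrative, not part of the
# original module): it assumes parsed dockets produced by
# docket_parser.parse_dir() are in './results/parsed_dockets'. The search
# terms and the 'mtd' suffix are arbitrary examples.
def _example_search_parsed_dockets():
    processor = docket_processor(processed_path='./results/parsed_dockets',
                                 output_path='./results/')
    # Flag docket entries that mention a motion to dismiss but not "denied"
    processor.search_dir(require_term=['motion to dismiss'],
                         exclude_term=['denied'],
                         case_sensitive=False)
    # Write one consolidated .csv of all hits to './results/docket_hits/'
    processor.write_all_matches('mtd', overwrite_flag=True)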
class document_sorter():
    """
    Not implemented yet. Sorry.
    """
    def __init__(self, docket_path='./results/local_docket_archive',
                 document_path='./results/local_document_archive',
                 output_path='./results', searchable_criteria='court'):
        self.docket_path = docket_path
        self.document_path = document_path
        self.searchable_criteria = searchable_criteria
        self.file_index = {}
        self.flags = []

        # Set the output folder for the text conversions (currently
        # disabled; kept as a string so it does not execute)
        x = """self.output_path = (os.path.abspath(output_path)
                                   + '/text_document_archive/')
        if not os.path.exists(output_path):
            os.makedirs(os.path.abspath(output_path)
                        + "/local_docket_archive")
            os.makedirs(os.path.abspath(output_path)
                        + "/local_document_archive")
        elif not os.path.exists(output_path + "/local_docket_archive/"):
            os.makedirs(os.path.abspath(output_path)
                        + "/local_docket_archive/")
        if not os.path.exists(output_path + "/local_document_archive/"):
            os.makedirs(os.path.abspath(output_path)
                        + "/local_document_archive/")
        """
    def convert_PDF_to_text(self, filename):
        """
        Convert a file to text and save it in the text_output_path.
        """
        pass

    def convert_all(self, overwrite=False):
        """
        For each file in the document path, run convert_PDF_to_text() if
        the file has not been converted before. Determine whether each
        file is searchable or not.
        """
        pass

    def set_flag(self):
        """
        Add a criterion to the flagging process.
        """
        pass

    def flag_searchable(self):
        """
        Flag files according to self.flags.
        Move files to a folder (make this an option).
        """
        pass

    def count(self):
        """
        Count the file_index.
        """
        pass

    def export_file_index(self):
        """
        Save the file_index to a file.
        """
        pass
class UTF8Recoder:
    """
    Iterator that reads an encoded stream and re-encodes the input to
    UTF-8.
    """
    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        return self.reader.next().encode("utf-8")
class UnicodeReader:
    """
    A CSV reader which will iterate over lines in the CSV file "f", which
    is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, dialect=dialect, **kwds)

    def next(self):
        row = self.reader.next()
        return [unicode(s, "utf-8") for s in row]

    def __iter__(self):
        return self
class UnicodeWriter:
    """
    A CSV writer which will write rows to the CSV file "f", which is
    encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and re-encode it into the target encoding
        data = self.encoder.encode(data)
        # Write to the target stream
        self.stream.write(data)
        # Empty the queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

# class postprocessor():
#     print datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
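# A minimal usage sketch for UnicodeWriter (illustrative, not part of the
# original module): write unicode rows to a UTF-8 encoded .csv under
# Python 2. The filename and row contents are arbitrary examples.
def _example_write_unicode_csv():
    with open('example_output.csv', 'w') as f:
        writer = UnicodeWriter(f, dialect='excel')
        writer.writerow([u'case_number', u'docket_description'])
        writer.writerows([[u'1:10-cv-00001',
                           u'Complaint filed \u2013 jury demand']])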