# Source code for pacer_lib.scraper

import codecs
import cStringIO
import csv
import datetime
import json
import os
import requests
import re
import time
from bs4 import BeautifulSoup, Comment
#import lxml
# TODO: Implement logging in search_agent: create a timestamped logfile and an error variable in __init__, add a function that writes to the logfile when an error occurs, and make every query or request record its errors through that function.
# TODO: Implement MySQL support.
class search_agent():
    """
    Interface to the PACER Case Locator built around a ``requests`` session.

    A ``search_agent()`` object queries the PACER Case Locator and downloads
    both dockets and documents.

    **Keyword Arguments**

    * ``username``: a valid PACER username
    * ``password``: the PACER password that goes with ``username``
    * ``output_path``: the relative path where downloads are saved. Docket
      sheets are written to the subfolder '/local_docket_archive/' and
      documents to '/local_document_archive/'. Missing folders are created.
    * ``auto_login``: log in as soon as the object is instantiated (turn this
      off if you only want to use ``search_agent()`` to build PACER query
      strings).
    * ``wait_time``: how long (in seconds) to wait after each Case Locator
      query.
    """

    def __init__(self, username, password, output_path ='./results',
                 auto_login=True, wait_time=1):
        self.username = username
        self.password = password
        # Honour the caller's delay; it used to be hard-coded to 1.
        self.wait_time = wait_time

        # ``br`` holds the requests session. The empty string means
        # "no session yet" and is what ``query_case_locator`` tests for.
        self.br = ''
        if auto_login:
            self.refresh_login()

        # Make sure both archive folders exist below output_path.
        self.output_path = os.path.abspath(output_path)
        for subfolder in ("local_docket_archive", "local_document_archive"):
            archive = self.output_path + "/" + subfolder
            if not os.path.exists(archive):
                os.makedirs(archive)

    @staticmethod
    def _court_short_id(case_filename):
        """
        Returns the ECF host prefix (e.g. 'nysd') encoded in *case_filename*.

        *case_filename* must look like '<court_id>_<office>+<yy>-<type>-<no>'
        (e.g. 'nysdce_1+00-cv-00001'). Only the trailing 'ce' / 'ke' suffix
        of the court_id is removed; removing those letters anywhere in the
        id would turn 'ncedce' into 'nd' and 'okedce' into 'od'.

        Raises ``ValueError('Bad case_filename')`` if the name does not match.
        """
        match = re.search(r'([a-zA-Z]{5,6})_(\d)\+(\d\d)-'
                          r'([a-zA-Z]{2})-(\d{1,5})',
                          case_filename.lower())
        if not match:
            raise ValueError('Bad case_filename')
        court_id = match.group(1)
        if court_id.endswith(('ce', 'ke')):
            return court_id[:-2]
        return court_id

    def refresh_login(self):
        """
        Logs in to the PACER system using the login and password provided at
        the initialization of ``search_agent()``.

        This creates a new Requests session (stored in ``self.br``) that is
        used for all later queries. If *auto_login* =False,
        ``refresh_login()`` must be called before you can query the
        case_locator (``query_case_locator`` does this for you).

        Raises ``ValueError`` if the login page cannot be parsed or if the
        login or password is rejected, and ``BaseException`` if PACER reports
        that the account is locked.

        Returns nothing.
        """
        login_url = 'https://pacer.login.uscourts.gov/csologin/login.jsf'
        self.br = requests.Session()
        login_html = self.br.get(login_url)
        soup = BeautifulSoup(login_html.text)

        # The id of the login button changes, so it is read from the form
        # and sent back as an (empty) field of the POST.
        login_button = soup.find('button',
                                 {'onclick' : 'barWaitDialog.show();'},
                                 text='Login')
        if login_button is None or not login_button.get('id'):
            raise ValueError('Unable to parse the PACER login page')
        button = login_button['id']

        payload = {'login':'login',
                   'login:clientCode':'',
                   button:'',
                   'login:loginName':self.username,
                   'login:password':self.password,
                   'javax.faces.ViewState':'stateless'}
        response = self.br.post(login_url, data=payload)

        if "Invalid ID or password" in response.text:
            raise ValueError("Invalid ID or password")
        if "Too many failed login attempts. Account Locked" in response.text:
            # NOTE(review): BaseException is kept so existing callers that
            # catch it keep working; a dedicated exception would be cleaner.
            raise BaseException("Login Locked")

    def query_case_locator(self, payload):
        """
        Returns the HTML of the search results page as a string.

        This function passes queries to the PACER Case Locator
        (https://pcl.uscourts.gov/dquery) and is the simplest interface (you
        can send any key:value pairs as a POST request). We do not recommend
        using this unless you want more advanced functionality.

        **Keyword Arguments**

        * ``payload``: a dictionary of key-value pairs that is sent as the
          POST data.

        Raises ``TypeError`` if *payload* is not a dictionary.
        """
        locator_url = 'https://pcl.uscourts.gov/dquery'

        # Start a PACER session if there is none yet.
        if not self.br:
            self.refresh_login()

        if not isinstance(payload, dict):
            raise TypeError("'payload' must be a dictionary.")
        if 'case_no' not in payload:
            print("Warning: You are not searching for a case number.")

        response = self.br.post(locator_url, data=payload)
        # Throttle so that consecutive queries do not hammer PACER.
        time.sleep(self.wait_time)
        return response.text

    def search_case_locator(self, case_no, other_options={'court_type':'all', 'default_form':'b'}):
        """
        Passes a query to the PACER Case Locator and returns a list of search
        results (as well as an error message, if applicable).

        Returns two objects, a list (*results*) and a string that indicates
        if there was an error.

        **Keyword Arguments**

        * ``case_no``: a string that represents a PACER query.
        * ``other_options``: extra POST fields for ``query_case_locator()``.
          They are validated against the known form options, and
          ``court_type`` / ``default_form`` are filled in when missing. The
          dictionary passed in is not modified.

        **Output Documentation**

        Each search result is a dictionary with these keys:

        * ``searched_case_no``
        * ``result_no``
        * ``case_name``
        * ``listed_case_no``
        * ``court_id``
        * ``nos``
        * ``link``
        * ``date_filed``
        * ``date_closed``

        The second object returned is a string that verbosely indicates
        errors that occurred. If search results were found, the string is
        empty.

        Raises ``KeyError`` if *other_options* contains an unknown form
        option.
        """
        # 1. CHECK FOR VALID ARGUMENTS AND PREPARE PAYLOAD
        # Work on a copy: neither the shared default dictionary nor the
        # caller's dictionary may pick up this query's case number.
        payload = dict(other_options or {})

        # Are these really the only form options?
        form_options = ("all_region", "case_no", "court_type",
                        "date_discharge_end", "date_discharge_start",
                        "date_dismiss_end", "date_dismiss_start",
                        "date_filed_end", "date_filed_start",
                        "date_term_end", "date_term_start", "default_form",
                        "mdl_id", "party", "ssn", "ssn4", "stitle")
        for key in payload:
            if key not in form_options:
                raise KeyError("'" + key + "' is not a form option.")

        payload['case_no'] = str(case_no)
        # These two fields are known to be necessary to get search results.
        payload.setdefault('court_type', 'all')
        payload.setdefault('default_form', 'b')

        # 2. QUERY PACER CASE LOCATOR
        source = self.query_case_locator(payload)

        # Simple error checking of the results.
        if "Invalid case number" in source:
            return [], "ERROR: Invalid Case Number"
        elif "No records found" in source:
            return [], "No Search Results"

        # 3. PARSE THE RESULTS TABLE
        source_code = BeautifulSoup(source)
        results_table = source_code.find('table', {'align':'center'})
        if not results_table:
            return [], "ERROR: No Results Table"

        # Temporary fix: bankruptcy results list a chapter instead of a
        # nature-of-suit code. (There is also a disposition column in
        # bankruptcy results that is not pulled.)
        is_bankruptcy = 'bk' in str(case_no)

        results = []
        for result in results_table.findAll('tr'):
            # Skip the row with column headers.
            if not result.find('td', {'class':'court_id'}):
                continue

            case_cell = result.find('td', {'class':'case'})
            case_info = {}
            case_info['searched_case_no'] = case_no
            case_info['result_no'] = result.find('td', {'class':'line_no'}).string
            case_info['case_name'] = result.find('td', {'class':'cs_title'}).string
            case_info['listed_case_no'] = case_cell.a.string
            case_info['court_id'] = result.find('td', {'class':'court_id'}).string
            if is_bankruptcy:
                case_info['nos'] = result.find('td', {'class':'cs_chapter'}).string
            else:
                case_info['nos'] = result.find('td', {'class':'nos'}).string
            case_info['link'] = case_cell.a.get('href')

            # Dates: the first cs_date cell is the filing date, the second
            # the closing date. Empty or missing cells become "None".
            dates = [cell.string or "None"
                     for cell in result.find_all('td', {'class':'cs_date'})]
            dates += ["None"] * (2 - len(dates))
            case_info['date_filed'] = dates[0]
            case_info['date_closed'] = dates[1]

            results.append(case_info)
        return results, ""

    def request_docket_sheet(self, docket_link, other_options={}):
        """
        Returns the HTML of the docket sheet specified by *docket_link*.

        You can also pass additional POST requests through *other_options*;
        if it is empty a default set of report options is used. The
        dictionary passed in is not modified.

        Raises ``ValueError`` if *docket_link* contains no case id or if the
        docket report generator page cannot be parsed, and ``SystemError``
        if PACER answers with the "many docket entries" page (not handled).
        """
        # Identify the case value. An empty match must be rejected:
        # str.replace('', x) below would otherwise mangle the whole link.
        link_search = re.search(r'DktRpt.pl\?(\d*)', docket_link)
        if not link_search or not link_search.group(1):
            raise ValueError('Bad Link')
        case_value = link_search.group(1)

        # Pull up the docket report generator page and read its query token.
        response = self.br.get(docket_link)
        token_search = re.search(r'DktRpt.pl\?(.*)"', response.text)
        if not token_search:
            raise ValueError('Unable to parse the docket report generator page')
        query_value = token_search.group(1)

        # Default report options, used when the caller supplies none.
        if other_options:
            payload = dict(other_options)
        else:
            payload = {'date_range_type':'Filed',
                       'list_of_member_cases':'on',
                       'list_of_parties_and_counsel':'on',
                       'terminated_parties':'on',
                       'pdf_header':'1',
                       'output_format':'html',
                       'sort1':'oldest date first'}

        # Case dependent options.
        payload['all_case_ids'] = case_value
        payload['CaseNum_' + case_value] = 'on'

        # Request the docket report.
        response = self.br.post(docket_link.replace(case_value, query_value),
                                data=payload)

        # Sometimes, PACER gives you a "many docket entries" page, which is
        # recognised by the form it contains. Following it is not
        # implemented yet.
        if "</form>" in response.text.lower():
            raise SystemError("Many Docket Entries; Charles needs to code in this exception")
        return response.text

    def request_document(self, case_filename, document_link, other_options={}):
        """
        Using a case_filename and a link to the document, this function
        constructs the necessary POST data and finds the correct document
        URL to download the specified PDF document.

        Returns binary data.

        You can also pass additional POST requests through *other_options*.

        Raises ``ValueError`` if *case_filename* is malformed, if case_id or
        de_seq_num cannot be identified, or if the download form cannot be
        found.

        (For version 2.1) Currently only implemented for district courts,
        but should eventually be implemented for bankruptcy and appellate
        courts.
        """
        original_link = document_link

        # 0. Check that the case_filename is in the correct format.
        ecf_root = ("https://ecf." + self._court_short_id(case_filename)
                    + ".uscourts.gov")

        # 1. Identify case_id, de_seq_num and if it is a single-file
        #    document.
        de_seq_num = ''
        case_id = ''
        single_file = True

        # Method 1: Try finding both of the variables in the link.
        case_id_search = re.search(r'caseid=(\d*)', document_link)
        if case_id_search:
            case_id = case_id_search.group(1)
        de_seq_search = re.search(r'de_seq_num=(\d*)', document_link)
        if de_seq_search:
            de_seq_num = de_seq_search.group(1)
        # Method 2: Find it in the HTML of the link.
        else:
            r = self.br.get(document_link)
            temp_soup = BeautifulSoup(r.text)
            post_data = temp_soup.find('form')
            if post_data:
                # If there is a form, look at the 'onsubmit' attribute.
                onsubmit = post_data.get('onsubmit')
                if onsubmit:
                    search = re.search(
                        r"goDLS\('/doc1/\d*','(?P<case_id>\d*)'"
                        r",'(?P<de_seq_num>\d*)'", onsubmit)
                    if search:
                        case_id = search.group('case_id')
                        de_seq_num = search.group('de_seq_num')
            else:
                # If there isn't a form, then this is a multi-file document.
                single_file = False
                # Look for a "View All" button.
                post_data = temp_soup.find('input', {'value':'View All'})
                if post_data:
                    onclick = post_data.get('onclick')
                    if onclick:
                        url = onclick.replace('\'', '')
                        url = url.replace('parent.location=', '')
                        document_link = ecf_root + url
                        case_id_search = re.search(r'caseid=(\d*)',
                                                   document_link)
                        de_seq_search = re.search(r'arr_de_seq_nums=(\d*)',
                                                  document_link)
                        if case_id_search and de_seq_search:
                            case_id = case_id_search.group(1)
                            de_seq_num = de_seq_search.group(1)

        # Check if we identified the case_id or not.
        if not case_id or not de_seq_num:
            if document_link == original_link:
                raise ValueError('Could not identify case_id or de_seq_num '
                                 + 'from \n\'' + document_link + '\'' )
            else:
                raise ValueError('Multi-part document. Could not identify '
                                 + 'case_id or de_seq_num from \n\''
                                 + document_link + '\' or \n\''
                                 + original_link +' \'')

        # 2. Encode the POST Request
        payload = {'caseid' : case_id,
                   'got_receipt' : '1',
                   'pdf_header' : '2',
                   'pdf_toggle_possible' : '1'}
        if single_file:
            # A stray trailing comma used to store this as a 1-tuple.
            payload['de_seq_num'] = de_seq_num
        else:
            payload['arr_de_seq_nums'] = de_seq_num
        if other_options:
            payload.update(other_options)

        # 3. Find the final download link from the 'charge' page.
        # (there used to be a retry if cannot open, but removed, for now)
        response = self.br.get(document_link)
        print("continue")

        # Parse the 'charge' page to find the 'viewdoc' url that will
        # request the actual document.
        temp_soup = BeautifulSoup(response.text)
        viewdoc_url = temp_soup.find('form', {'action':True})

        # If there is no form on this page, then we have found another
        # intermediate multi-file document page. Follow the links.
        if not viewdoc_url:
            multipage_viewdoc_url = temp_soup.find('a', {'onclick':True})
            if multipage_viewdoc_url is None:
                raise ValueError('Could not find a download form or a '
                                 + 'follow-up link on \n\''
                                 + document_link + '\'')
            temp_response = self.br.post(multipage_viewdoc_url.get('href'))
            temp_soup = BeautifulSoup(temp_response.text)
            # We should now be at the "Accept Charges Page".
            viewdoc_url = temp_soup.find('form', {'action':True})
            if viewdoc_url is None:
                raise ValueError('Could not find the download form for \n\''
                                 + document_link + '\'')

        # Pull out the final document URL.
        doc_url = viewdoc_url.get('action')
        if "http" in doc_url:
            document_url = doc_url
        else:
            document_url = ecf_root + doc_url

        # 4. Post to the URL from step 3 with the post_data from part 2.
        document = self.br.post(document_url, data=payload)

        # The PDF might be embedded in an iframe.
        iframes = re.search(r"/cgi-bin/show_temp\.pl\?file=.*=application/pdf",
                            document.text)
        if iframes:
            iframe_src = ecf_root + iframes.group(0)
            document = self.br.post(iframe_src, data=payload)

        # Return the content of the document.
        return document.content

    def download_case_docket(self, case_no, court_id, other_options={'court_type':'all','default_form':'b'}, overwrite=False):
        """
        Returns a list that indicates the case_no, court_id and any error.

        ``download_case_docket`` also writes the .html docket sheet to
        *self.output_path* (in the subfolder '/local_docket_archive/').

        If you set *overwrite*=True, it will overwrite previous dockets.
        Otherwise, ``download_case_docket`` will check to see if the docket
        has already been downloaded **before** incurring any additional
        search or download charges.

        You can also pass additional POST requests through *other_options*
        (they are handed to ``search_case_locator``).
        """
        # ':' is not an acceptable character for Windows filenames, so we
        # replace it with '+'. In older versions, we replaced it with '_'.
        docket_folder = self.output_path + "/local_docket_archive/"
        docket_filename = (court_id + '_'
                           + case_no.replace(':', '+').strip() + '.html')
        docket_filepath = docket_folder + docket_filename
        # Only the filename is converted to the legacy form, so a '+' in
        # output_path cannot send the check to the wrong folder.
        legacy_filepath = docket_folder + docket_filename.replace('+', '_')

        # 0. Check if this docket has already been downloaded
        if not overwrite:
            if (os.path.exists(docket_filepath)
                    or os.path.exists(legacy_filepath)):
                return [case_no, court_id, "Docket already downloaded"]

        # 1. Search PACER Case locator using case_no
        results, error = self.search_case_locator(case_no, other_options)

        # 2. IDENTIFY THE CORRECT RESULT
        if not results:
            return [case_no, court_id, error]
        # Use the first result that comes from the right court.
        correct_result = None
        for result in results:
            if court_id == result['court_id']:
                correct_result = result
                break
        if correct_result is None:
            return [case_no, court_id,
                    'No cases correspond to this case number in that court']

        # 3. DOWNLOAD THE DOCKET SHEET
        if not correct_result['link']:
            return [case_no, court_id, "No links to this case."]
        # Convert the link to a direct link to the docket report.
        docket_link = correct_result['link'].replace('iqquerymenu', 'DktRpt')

        # Add the header: an HTML comment with the search-result fields and
        # the download time. The quoting is kept exactly as it always was,
        # because existing dockets already carry this format.
        header_fields = [(key, correct_result[key]) for key in
                         ('searched_case_no', 'result_no', 'case_name',
                          'listed_case_no', 'court_id', 'nos', 'link',
                          'date_filed', 'date_closed')]
        header_fields.append(
            ('downloaded',
             datetime.datetime.now().strftime('%Y-%m-%d,%H:%M:%S')))
        detailed_info = ("<!--detailed_info:\n{"
                         + ",".join("'" + key + "':'" + value + "'"
                                    for key, value in header_fields)
                         + "}" + "-->\n")

        # Save the docket sheet.
        output = BeautifulSoup(detailed_info
                               + self.request_docket_sheet(docket_link))
        pretty_output = output.prettify()
        with codecs.open(docket_filepath, 'w', encoding='utf-8') as f:
            f.write(pretty_output)

        # The case number should appear in its own docket; if it does not,
        # ask for a manual check.
        if case_no not in pretty_output:
            return [case_no, court_id, "WARNING: Docket Downloaded, but manually double check downloaded docket"]
        return [case_no, court_id, "Docket Downloaded"]

    def download_document(self, case_filename, doc_no, doc_link, no_type='U', overwrite=False):
        """
        Returns a list that indicates the case_name, doc_no and any error.

        ``download_document`` also writes the .pdf document to
        *self.output_path* (to the sub-folder '/local_document_archive/').

        If you set *overwrite*=True, it will overwrite previously downloaded
        documents. Otherwise, ``download_document`` will check to see if
        the document has already been downloaded **before** incurring any
        additional search or download charges.

        (To be implemented) docket_parser() assigns two types of numbers:
        the listed docket number (i.e., the number listed on the page) and
        the unique identifier (i.e., the position of the docket entry on the
        page). We should default to using the unique identifier, but all of
        the legacy files will be using the listed identifier and we will
        need to reassociate / convert those documents to their unique
        identifier.

        * ``no_type = 'U'`` --> unique identifier
        * ``no_type = 'L'`` --> listed identifier

        We have begun implementing this, but this is not completely
        finished. Using the listed identifier should be considered legacy
        and not advised. This will be dangerous in terms of redundant
        download protection. Document this properly once we finish.

        (Not implemented) You can also pass additional POST requests through
        *other_options*.

        Raises ``ValueError`` if *case_filename* or *no_type* is invalid.
        """
        # 0. Check for valid inputs
        court_short_id = self._court_short_id(case_filename)

        no_type = no_type.upper()
        if no_type not in ('U', 'L'):
            raise ValueError('Bad no_type. Must be \'U\' or \'L\'')

        # ':' is not an acceptable character for Windows filenames, so we
        # replace it with '+'. In older versions, we replaced it with '_'.
        # We also prefix the document number with a 'U' if we are using
        # the unique identifier.
        number_prefix = 'U' if no_type == 'U' else ''
        doc_folder = self.output_path + "/local_document_archive/"
        doc_filename = (case_filename + "_document_" + number_prefix
                        + str(doc_no) + '.pdf')
        doc_filepath = doc_folder + doc_filename
        legacy_filepath = doc_folder + doc_filename.replace('+', '_')

        # Check if this document has already been downloaded
        if not overwrite:
            if (os.path.exists(doc_filepath)
                    or os.path.exists(legacy_filepath)):
                return [case_filename, doc_no, "Document already downloaded"]

        # 1. Format the download link.
        if "http" in doc_link:
            document_link = doc_link
        else:
            document_link = ("https://ecf." + court_short_id
                             + ".uscourts.gov" + doc_link)

        # 2. Pass the download link to request_document()
        output = self.request_document(case_filename, document_link)

        # 3. Save the output.
        if not output:
            return [case_filename, doc_no, "ERROR: Nothing Downloaded"]
        # The document is binary data, so the file is opened in binary
        # mode; text mode corrupts PDFs on platforms that translate line
        # endings.
        with open(doc_filepath, 'wb') as f:
            f.write(output)
        return [case_filename, doc_no, "Document downloaded"]
def disaggregate_docket_number(combined_docket_number):
    """
    Splits a combined docket number into ``(year, case_id)``.

    Combined year and case numbers are often stored as integers, which
    drops leading zeroes. The input is converted to a string; a 5- or
    6-character value is left-padded with zeroes to 7 characters. The
    first two characters are returned as the two-digit year and the
    remaining five as the PACER case_id (both as strings).

    The minimum number of digits for this function is five (which assumes
    that the case was from 2000). If there are further truncations (e.g.,
    '00-00084' stored as '0000084' and truncated to '84'), pre-process
    your case-numbers.

    Raises ``ValueError`` unless the value has 5, 6 or 7 characters.
    """
    digits = str(combined_docket_number)

    # Restore up to two leading zeroes lost to integer storage.
    missing = 7 - len(digits)
    if missing in (1, 2):
        digits = '0' * missing + digits

    if len(digits) != 7:
        raise ValueError ('The docket_number must have either 5, 6 or 7 '
                          + 'digits.')

    return digits[:2], digits[2:]
def gen_case_query(district, office, year, docket_number, type_code,
                   district_first=True):
    """
    Creates a PACER query from the district, office, year, docket number
    and case type and returns a tuple of (case_no, court_id, region).

    PACER case-numbers can be generated by consolidating the district,
    office, year, case id and case type information in a specific way.
    This function formats the district name and type_code correctly and
    then combines the case identifying information into a single PACER
    query of the form ``office:yy-type-docket_number``.

    Many other data sources list the district of the court before the
    state, e.g., EDNY rather than NYED. If this is not the case, turn off
    the district_first option. Fully written out names (more than four
    characters once spaces are removed, e.g. "Northern District of West
    Virginia") are also accepted.

    **Keyword Arguments**

    * ``district``: a court abbreviation or a fully written out name.
    * ``office``: at most 1 character.
    * ``year`` should be either 2 digits (e.g., 00) or 4 digits (e.g.,
      1978).
    * ``docket_number``: at most 5 characters.
    * ``type_code`` must be one of the following (case-insensitive):
      civil, civ, criminal, crim, bankruptcy, bank, cv, cr, bk

    All inputs are converted to strings first.

    Returns a tuple (case_no, court_id, region), where *region* is the
    upper-cased first two letters of the court_id.

    Raises ``ValueError`` if any input cannot be recognised or if the
    resulting court_id is not a known district / bankruptcy court.

    (For Version 2.1) Note: Appellate Courts have not been implemented
    yet. Some of this functionality may not be necessary and should be
    revisited. Specifically, year can be 2 or 4 digits and case number
    does not have to be exactly 5 digits (up to 5 digits). Office must be
    exactly 1 digit. We could also consider including the specific state
    in the output.
    """
    type_code_dict = {'civil':'cv', 'civ': 'cv',
                      'criminal': 'cr', 'crim': 'cr',
                      'bankruptcy': 'bk', 'bank': 'bk'}

    state_to_code = {"alaska": "ak", "alabama": "al", "arkansas": "ar",
        "arizona": "az", "california": "ca", "colorado": "co",
        "connecticut": "ct", "delaware": "de", "district of columbia": "dc",
        "florida": "fl", "georgia": "ga", "hawaii": "hi", "iowa": "ia",
        "idaho": "id", "illinois": "il", "indiana": "in", "kansas": "ks",
        "kentucky": "ky", "louisiana": "la","maine": "me", "maryland": "md",
        "massachusetts": "ma", "michigan": "mi", "minnesota": "mn",
        "mississippi": "ms", "missouri": "mo", "montana": "mt",
        "nebraska": "ne", "nevada": "nv", "new hampshire": "nh",
        "new jersey": "nj", "new mexico": "nm", "new york": "ny",
        "north carolina": "nc", "north dakota": "nd",
        "northern mariana islands": "nmi", "ohio": "oh", "oklahoma": "ok",
        "oregon": "or", "pennsylvania": "pa", "puerto rico": "pr",
        "rhode island": "ri", "south carolina": "sc", "south dakota": "sd",
        "tennessee": "tn", "texas": "tx", "utah": "ut", "vermont": "vt",
        "virgin islands": "vi", "virginia": "va", "washington": "wa",
        "west virginia": "wv", "wisconsin": "wi", "wyoming": "wy"}

    district_dict = {'northern district': 'nd', 'southern district': 'sd',
                     'eastern district': 'ed', 'western district': 'wd',
                     'middle district': 'md', 'central district': 'cd',
                     'northern bankruptcy': 'nb', 'southern bankruptcy': 'sb',
                     'eastern bankruptcy': 'eb', 'western bankruptcy': 'wb',
                     'middle bankruptcy': 'mb', 'central bankruptcy': 'cb'}

    # Two-letter prefixes a court_id may start with. GU, PR and VI are
    # included because their courts appear in the court lists below.
    states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL",
              "GA", "GU", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA",
              "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV",
              "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA",
              "PR", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VI", "VA",
              "WA", "WV", "WI", "WY"]

    district_court_ids = ("almdce", "alndce", "alsdce", "akdce", "azdce",
        "aredce", "arwdce", "cacdce", "caedce", "candce", "casdce", "codce",
        "ctdce", "dedce", "dcdce", "flmdce", "flndce", "flsdce", "gamdce",
        "gandce", "gasdce", "gudce", "hidce", "iddce", "ilcdce", "ilndce",
        "ilsdce", "inndce", "insdce", "iandce", "iasdce", "ksdce", "kyedce",
        "kywdce", "laedce", "lamdce", "lawdce", "medce", "mddce", "madce",
        "miedce", "miwdce", "mndce", "msndce", "mssdce", "moedce", "mowdce",
        "mtdce", "nedce", "nvdce", "nhdce", "njdce", "nmdce", "nyedce",
        "nyndce", "nysdce", "nywdce", "ncedce", "ncmdce", "ncwdce", "nddce",
        "nmidce", "ohndce", "ohsdce", "okedce", "okndce", "okwdce", "ordce",
        "paedce", "pamdce", "pawdce", "prdce", "ridce", "scdce", "sddce",
        "tnedce", "tnmdce", "tnwdce", "txedce", "txndce", "txsdce", "txwdce",
        "utdce", "vtdce", "vidce", "vaedce", "vawdce", "waedce", "wawdce",
        "wvndce", "wvsdce", "wiedce", "wiwdce", "wydce")

    bankruptcy_court_ids = ("almbke", "alnbke", "alsbke", "akbke", "azbke",
        "arebke", "arwbke", "cacbke", "caebke", "canbke", "casbke", "cobke",
        "ctbke", "debke", "dcbke", "flmbke", "flnbke", "flsbke", "gambke",
        "ganbke", "gasbke", "gubke", "hibke", "idbke", "ilcbke", "ilnbke",
        "ilsbke", "innbke", "insbke", "ianbke", "iasbke", "ksbke", "kyebke",
        "kywbke", "laebke", "lambke", "lawbke", "mebke", "mdbke","mabke",
        "miebke", "miwbke", "mnbke", "msnbke", "mssbke", "moebke", "mowbke",
        "mtbke", "nebke", "nvbke", "nhbke", "njbke", "nmbke", "nyebke",
        "nynbke", "nysbke", "nywbke", "ncebke", "ncmbke", "ncwbke", "ndbke",
        "nmibke", "ohnbke", "ohsbke", "okebke", "oknbke", "okwbke", "orbke",
        "paebke", "pambke", "pawbke", "prbke", "ribke", "scbke", "sdbke",
        "tnebke", "tnmbke", "tnwbke", "txebke", "txnbke", "txsbke", "txwbke",
        "utbke", "vtbke", "vibke", "vaebke", "vawbke", "waebke", "wawbke",
        "wvnbke", "wvsbke", "wiebke", "wiwbke", "wybke")

    # 0. PRE-PROCESS: Force all inputs to string type.
    district = str(district)
    office = str(office)
    year = str(year)
    docket_number = str(docket_number)
    # Lower-case before the lookup; looking up the original spelling
    # raised KeyError for inputs such as 'Civil'.
    type_code = str(type_code).lower()

    # 1. PROCESS THE TYPE-CODE
    type_code = type_code_dict.get(type_code, type_code)
    # ('ap', appellate cases, has not been implemented)
    if type_code not in ('cr', 'cv', 'bk'):
        raise ValueError ('Invalid type-code.')
    # Using the type-code, we determine the correct court_id suffix.
    suffix = 'ke' if type_code == 'bk' else 'ce'

    # 2. PROCESS THE DISTRICT NAME
    district = district.lower().replace(' ','')
    court_id = ''

    # For fully written out district names, we attempt to
    # convert this into a useable PACER abbreviation.
    if len(district) > 4:
        # Find the state. Longer names are tried first so that
        # "West Virginia" is not mistaken for "Virginia" (or "Arkansas"
        # for "Kansas"); the result no longer depends on dict ordering.
        for key in sorted(state_to_code,
                          key=lambda name: (-len(name), name)):
            if key.replace(' ', '') in district:
                court_id = state_to_code[key]
                break
        if not court_id:
            raise ValueError ("Invalid 'district' input;" +
                              " could not determine the state in \"" +
                              district +"\"")

        # Find the district within the state.
        for key in district_dict:
            if key.replace(' ', '') in district:
                court_id = court_id + district_dict[key] + suffix
                break

        # If the district is not fully written out, one last try: the
        # state has a single district / bankruptcy court.
        if len(court_id) == 2:
            if 'district' in district and 'columbia' not in district:
                court_id = court_id + 'd' + suffix
            elif 'bankruptcy' in district:
                court_id = court_id + 'b' + suffix

        if len(court_id) < 5 or len(court_id) > 6:
            raise ValueError ("Invalid 'district' input;" +
                              " could not determine the district in \"" +
                              district +"\"")

    # Process abbreviations.
    if district_first:
        if district.startswith('dc') and len(district) == 4:
            # Single-district districts courts.
            court_id = district[2:4] + 'dce'
        elif len(district) == 4:
            # Reverse the order of 4-character district codes.
            court_id = district[2:4] + district[0:2] + suffix
        elif len(district) == 3:
            # Reverse the order of 3-character codes.
            court_id = district[1:3] + district[0:1] + suffix
    else:
        if district.endswith('dc') and len(district) == 4:
            # Single-district districts courts.
            court_id = district[0:2] + 'dce'
        elif len(district) == 4 or len(district) == 3:
            # Append the suffix to 3 or 4 letter districts.
            court_id = district + suffix

    # Raise an error if we have not recognized the court.
    if not court_id:
        raise ValueError("Invalid 'district' input;" +
                         " could not recognize \"" + district + "\"")

    # 3. PROCESS YEAR
    if len(year) == 4:
        year = year[2:4]
    if len(year) != 2:
        raise ValueError("Invalid 'year' input;" +
                         "could not recognize \"" + year + "\"")

    # 4. SANITY CHECKS
    if len(office) > 1:
        raise ValueError("'office' cannot be more than 1 character")
    if len(type_code) > 2:
        raise ValueError("Final 'type-code' cannot be more than 2 characters")
    if len(docket_number) > 5:
        raise ValueError("'docket_number' cannot be more than 5 characters")

    if type_code == "cv" or type_code == "cr":
        if court_id not in district_court_ids:
            raise ValueError("'" + court_id + "' is an invalid district " +
                             "court_id.")
    elif type_code == "bk":
        # This branch used to test for "br" and therefore never ran.
        if court_id not in bankruptcy_court_ids:
            raise ValueError("'" + court_id + "' is an invalid bankruptcy " +
                             "court_id.")

    if court_id[0:2].upper() not in states:
        raise ValueError("'" + court_id +
                         "' is not from a valid state/region")

    # 5. COMBINE INFORMATION
    case_no = office + ":" + year + "-" + type_code + "-" + docket_number
    region = court_id[0:2].upper()
    return (case_no, court_id, region)