# Upload a PDF to the RECAP Archive.
#
# John Hawkinson <[email protected]>
# 29 December 2017
#
# This is kind of rough, because:
#
# * RECAP/CourtListener (CL) wants PACER metadata for each PDF BEFORE
# it receives the PDF
# * BigCases doesn't preserve the exact RSS XML, so we can't just feed
# it to RECAP
# * RECAP doesn't have an RSS XML-input endpoint, although it could
# easily gain one
# * So before we can upload a PDF, we have to fake up the metadata in
# the format that CL understands, and transmit it. But only if that
# metadata isn't already there.
# * Because BigCases doesn't preserve the parsed data from the RSS
# feed, we need to use feedparser's date parser to get it.
from HTMLParser import HTMLParser
import feedparser
import re
import requests
import time
import urlparse
from bigcases_settings import settings
API_BASE = 'https://www.courtlistener.com/api/rest/v3/'
TIMEOUTS = (60, 300) # 1 minute connect timeout, 5 min. read timeout
VERBOSE = 1
class _UploadType():
DOCKET = 1
ATTACHMENT_PAGE = 2
PDF = 3
DOCKET_HISTORY_REPORT = 4
APPELLATE_DOCKET = 5
APPELLATE_ATTACHMENT_PAGE = 6
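    # These numeric codes are what CL's "recap/" upload endpoint expects in
    # its upload_type field; only DOCKET and PDF are used below.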
class RecapUpload(object):
"""Upload a document to the RECAP archive."""
def PACER_Court_to_CL(self, PACER_court):
# An unfortunate design decision was made in the past.
PACER_TO_CL_IDS = {
'azb': 'arb', # Arizona Bankruptcy Court
'cofc': 'uscfc', # Court of Federal Claims
'neb': 'nebraskab', # Nebraska Bankruptcy
'nysb-mega': 'nysb', # Remove the mega thing
}
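        # e.g. 'cofc' maps to 'uscfc'; IDs with no special mapping, such as
        # 'dcd', pass through unchanged.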
return PACER_TO_CL_IDS.get(PACER_court, PACER_court)
def __init__(self, filename,
docket_number, docket_caption,
published_date,
item_description):
"""Upload a document to the RECAP archive.
        We do this based on what little information we have. This involves
multiple roundtrips to the RECAP server, unfortunately.
The parameters we get are all from RSS, as available to the
BigCases scraper. Unfortunately that's not the full RSS XML,
because it takes a trip through a SQL database.
PARAMETER: SOURCE
docket_number and docket_caption: item.title
published_date: item.published
item_description: item.description
Outline:
1. We first parse the item_description to get the docket text,
the item URL (including DLS aka "doc1" number), and the
entry number (link anchor).
        2. We look up the case, or determine we need to fake it.
        3. We look up the docket entry, or determine we need to fake it.
4. If necessary, we fake the docket entry and case title.
5. We upload the PDF.
"""
        if len(settings.recap_token) < 40:
# Not a valid token
return None
        # #1. We first parse the item_description to get the docket text,
        # the item URL, and the entry number.
h = HTMLParser()
itemDecoded = h.unescape(item_description)
        # BEFORE unescape() call:
        #  [Motion For Order] (<a href="https://ecf.dcd.uscourts.gov/
        #   doc1/04506366063?caseid=190182&amp;de_seq_num=369">94</a>)
        # AFTER unescape() call:
        #  [Motion For Order] (<a href="https://ecf.dcd.uscourts.gov/
        #   doc1/04506366063?caseid=190182&de_seq_num=369">94</a>)
# NOTE that Appellate takes this form:
# [Order Filed (CLERK)] (<a href='https://ecf.cadc.uscourts.gov/
# docs1/01207988480'>Document</a>)
match = re.search(r'''(?x)
\[(?P<text>[^\]]*)\]
.*?
<a\ href=(?P<q1>["'])(?P<url>.*?)(?P=q1)\s*>
(?P<anchor>.*?)
</a>
''', itemDecoded)
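        # For the district-court example above, this yields
        # text='Motion For Order', anchor='94', and the doc1 URL.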
if match is None:
return None
text = match.group('text')
url = match.group('url')
entry_number = match.group('anchor')
        match = re.search(r'http.*/docs?1/(?P<dls>\d+)\?', url)
        if match is None:
            return None  # e.g. an appellate "docs1" URL with no query string
        pacer_doc_id = match.group('dls')
if pacer_doc_id[3:4] == '1':
# PACER sites use the fourth digit of the pacer_doc_id to
# flag whether the user has been shown a receipt page. We
# don't care about that, so we always set the fourth digit
# to 0 when getting a doc ID.
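            # (e.g. a doc1 number of '04516366063' becomes '04506366063')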
pacer_doc_id = pacer_doc_id[0:3] + '0' + pacer_doc_id[4:]
parsed_url = urlparse.urlparse(url)
court = self.PACER_Court_to_CL(parsed_url.hostname.split('.')[1])
qparams = urlparse.parse_qs(parsed_url.query)
pacer_case_id = qparams['caseid'][0]
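        # For the example URL above, court is 'dcd' and pacer_case_id is
        # '190182'.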
        # #2. We look up the case, or determine we need to fake it.
need_fake_case = False
need_fake_entry = False
if VERBOSE > 0:
print "Checking RECAP for case %s in %s" % (pacer_case_id, court)
# /dockets/?pacer_case_id=189311&court=mad
r = requests.get(
url=API_BASE+'dockets/',
headers={'Authorization': 'Token %s' % settings.recap_token},
params={'court': court,
'pacer_case_id': pacer_case_id},
timeout=TIMEOUTS,
)
if VERBOSE >= 2:
print "Returns:", r, r.text
rj = r.json()
if (not r.ok) or rj['count'] < 1:
if VERBOSE > 0:
print "RECAP docket doesn't exist. We need to fake one up."
need_fake_case = True
else:
# The CL docket ID, needed for the next query, is here:
# "resource_uri":
# "https://www.courtlistener.com/api/rest/v3/dockets/6125037/",
cl_docket_id = re.search(r'/dockets/(\d+)/',
rj['results'][0]['resource_uri']).group(1)
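            # (the example resource_uri above yields cl_docket_id '6125037')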
        # #3. We look up the docket entry, or determine we need to fake it.
if not need_fake_case:
# If we don't have a case, we need not check for a docket entry
if VERBOSE > 0:
print "Checking RECAP for entry %s in CL case %s" \
% (entry_number, cl_docket_id)
# Originally we had:
# /recap-documents/?pacer_case_id=189311&court=mad
# &pacer_doc_id=09508417779
# Or simply
# /recap-documents/?pacer_doc_id=09508417779
# But alternatively
# /docket-entries/?recap_documents__pacer_doc_id=09508346951
# But now Mike's preference:
# /docket-entries/?docket_id=6249495&entry_number=13
# Because in some corner cases there is a Docket Entry
# object without a RECAP docket (...)
r = requests.get(
url=API_BASE+'docket-entries/',
headers={'Authorization': 'Token %s' % settings.recap_token},
params={
'docket': cl_docket_id,
'entry_number': entry_number,
},
timeout=TIMEOUTS,
)
if VERBOSE >= 2:
print "Returns: ", r
rj = r.json()
# Control debugging spew.
# plain_text is a very large amount of text.
try:
plainText = \
rj['results'][0]['recap_documents'][0]['plain_text']
if len(plainText) > 80:
rj['results'][0]['recap_documents'][0]['plain_text'] = \
plainText.strip()[:40] + '...'
            except (IndexError, KeyError):
pass
if VERBOSE >= 2:
print rj
if (not r.ok) or rj['count'] < 1:
if VERBOSE > 0:
print "The docket ENTRY doesn't exist. " + \
"We need to fake one up."
need_fake_entry = True
# #4. If necessary, we fake the docket entry and case title.
if need_fake_case or need_fake_entry:
            # Whether it's the case docket or just the docket entry that's
            # missing, we must fake up a docket.
date = time.strftime('%m/%d/%Y',
feedparser._parse_date(published_date))
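            # feedparser._parse_date() returns a time 9-tuple, so e.g. an RSS
            # date of 'Fri, 29 Dec 2017 17:05:35 GMT' becomes '12/29/2017'.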
html = ''
html += '<h3>THIS IS A FAKED UP DOCKET VIA RSS THROUGH '
html += 'recapupload.py<br>\n'
html += '%s</h3>\n' % docket_number
html += '<table>'
if need_fake_case:
html += '<td><br>%s' % docket_caption
# Lack of whitespace after this colon matters!
html += '<td>Date Filed:</table>\n'
html += '<table>'
html += '<tr>'
html += '<td>Date Filed</td><th>#</th><td>Docket Text</td></tr>\n'
html += '<tr>'
html += '<td>%s</td>\n' % date
html += '<td><a href="%s">%s</td>\n' % (url, entry_number)
html += '<td>%s</td>' % text
html += '</tr>\n</tbody></table>\n'
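            # This markup mimics a PACER docket page closely enough for CL's
            # parser; hence the picky formatting above (e.g. the bare colon).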
if VERBOSE >= 2:
print "We faked up this: \n", html
files = {'filepath_local': ('filepath_local', html, 'text/html')}
r = requests.post(
url=API_BASE+'recap/',
headers={'Authorization': 'Token %s' % settings.recap_token},
data={
'upload_type': _UploadType.DOCKET,
'court': court,
'pacer_case_id': pacer_case_id,
'debug': 'false', # Server throws away in debug mode
},
files=files,
timeout=TIMEOUTS,
)
if VERBOSE >= 2:
print "Returns: ", r, r.text
if (not r.ok):
if VERBOSE > 0:
print "Failed to upload faked docket entry info to RECAP."
return None
# #5. We upload the PDF.
files = {'filepath_local': open(filename, 'rb')}
r = requests.post(
url=API_BASE+'recap/',
headers={'Authorization': 'Token %s' % settings.recap_token},
data={
'upload_type': _UploadType.PDF,
'court': court,
'pacer_case_id': pacer_case_id,
'pacer_doc_id': pacer_doc_id,
'document_number': entry_number,
'debug': 'false', # Server throws away in debug mode
},
files=files,
timeout=TIMEOUTS,
)
if VERBOSE > 0:
print "RECAP upload returns: ", r, r.text
return None
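

# Example usage (a sketch only; the PDF path and the RSS-derived strings are
# hypothetical stand-ins, not values from a real feed):
#
#   RecapUpload('/tmp/example.pdf',
#               '1:17-cv-01154', 'Example v. Example',
#               'Fri, 29 Dec 2017 17:05:35 GMT',
#               '[Motion For Order] (<a href="https://ecf.dcd.uscourts.gov/'
#               'doc1/04506366063?caseid=190182&amp;de_seq_num=369">94</a>)')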