Skip to content

Commit 4d7a0aa

Browse files
SFR-2216: Fixing LOC process ingestion (#392)
1 parent 596efd4 commit 4d7a0aa

File tree

3 files changed

+28
-10
lines changed

3 files changed

+28
-10
lines changed

mappings/loc.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ def applyFormatting(self):
3333
self.record.source = 'loc'
3434
if self.record.medium:
3535
self.record.medium = self.record.medium[0]
36-
if len(self.record.is_part_of) == 0:
36+
if self.record.is_part_of and len(self.record.is_part_of) == 0:
3737
self.record.is_part_of = None
38-
if len(self.record.abstract) == 0:
38+
if self.record.abstract and len(self.record.abstract) == 0:
3939
self.record.abstract = None
4040

4141
#Convert string repr of list to actual list
@@ -62,6 +62,9 @@ def formatIdentifierSourceID(self, itemList):
6262
lccnNumber = self.record.identifiers[0][0] #lccnNumber comes in as an array and we need the string inside the array
6363
sourceID = lccnNumber
6464
if 'call_number' in newIdentifier.keys():
65+
if not isinstance(newIdentifier['call_number'], list):
66+
newIdentifier['call_number'] = list(newIdentifier['call_number'])
67+
6568
newIdentifier['call_number'][0] = f'{newIdentifier["call_number"][0]}|call_number'
6669
callNumber = newIdentifier['call_number'][0].strip(' ')
6770
else:
@@ -77,7 +80,7 @@ def formatPubSpatial(self, itemList):
7780
if ':' not in elem:
7881
createdPublishedList = elem.split(',', 1)
7982
pubLocation = createdPublishedList[0].strip(' ')
80-
if ',' in createdPublishedList[1]:
83+
if len(createdPublishedList) >= 2 and ',' in createdPublishedList[1]:
8184
pubOnly = createdPublishedList[1].split(',')[0].strip(' ')
8285
pubArray.append(pubOnly)
8386
spatialString = pubLocation

processes/loc.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ def runProcess(self):
5151

5252
self.saveRecords()
5353
self.commitChanges()
54+
55+
logger.info(f'Ingested {len(self.records)} LOC records')
5456

5557

5658
def importLOCRecords(self, startTimeStamp=None):
@@ -83,6 +85,9 @@ def importOpenAccessRecords(self, count, customTimeStamp):
8385
# An HTTP error will occur when the sp parameter value
8486
# passes the last page number of the collection search results
8587
while sp < 100000:
88+
if self.ingestLimit and count >= self.ingestLimit:
89+
break
90+
8691
openAccessURL = '{}&sp={}'.format(LOC_ROOT_OPEN_ACCESS, sp)
8792
jsonData = self.fetchPageJSON(openAccessURL)
8893
LOCData = jsonData.json()
@@ -129,6 +134,9 @@ def importDigitizedRecords(self, count, customTimeStamp):
129134
# An HTTP error will occur when the sp parameter value
130135
# passes the last page number of the collection search results
131136
while sp < 100000:
137+
if self.ingestLimit and count >= self.ingestLimit:
138+
break
139+
132140
digitizedURL = '{}&sp={}'.format(LOC_ROOT_DIGIT, sp)
133141
jsonData = self.fetchPageJSON(digitizedURL)
134142
LOCData = jsonData.json()
@@ -170,14 +178,17 @@ def processLOCRecord(self, record):
170178
try:
171179
LOCRec = LOCMapping(record)
172180
LOCRec.applyMapping()
181+
182+
if LOCRec.record.authors is None:
183+
logger.warning(f'Unable to map author in LOC record {LOCRec.record} ')
184+
return
185+
173186
self.addHasPartMapping(record, LOCRec.record)
174187
self.storePDFManifest(LOCRec.record)
175188
self.storeEpubsInS3(LOCRec.record)
176189
self.addDCDWToUpdateList(LOCRec)
177-
178-
except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e:
179-
logger.exception(e)
180-
logger.warn(LOCError('Unable to process LOC record'))
190+
except Exception:
191+
logger.exception(f'Unable to process LOC record')
181192

182193
def addHasPartMapping(self, resultsRecord, record):
183194
if 'pdf' in resultsRecord['resources'][0].keys():

tests/unit/test_loc_process.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import pytest
22

33
from mappings.core import MappingError
4+
from model import Record
45
from processes.loc import LOCProcess
56
from tests.helper import TestHelpers
67

@@ -49,16 +50,19 @@ def test_processLOCRecord_success(self, testProcess, mocker):
4950
addDCDWToUpdateList=mocker.DEFAULT
5051
)
5152

52-
mockMapping = mocker.MagicMock(record='testRecord')
53+
test_record = Record(authors=[])
54+
55+
mockMapping = mocker.MagicMock()
56+
mockMapping.record = test_record
5357
mockMapper = mocker.patch('processes.loc.LOCMapping')
5458
mockMapper.return_value = mockMapping
5559

5660
testProcess.processLOCRecord(mockMapping)
5761

5862
mockMapping.applyMapping.assert_called_once()
5963

60-
processMocks['addHasPartMapping'].assert_called_once_with(mockMapping, 'testRecord')
61-
processMocks['storePDFManifest'].assert_called_once_with('testRecord')
64+
processMocks['addHasPartMapping'].assert_called_once_with(mockMapping, test_record)
65+
processMocks['storePDFManifest'].assert_called_once_with(test_record)
6266
processMocks['addDCDWToUpdateList'].assert_called_once_with(mockMapping)
6367

6468
def test_processlocRecord_error(self, mocker):

0 commit comments

Comments
 (0)