Skip to content

Commit 66faee2

Browse files
SFR-2292: Skip clustering records with no title (#423)
1 parent d1b9e2e commit 66faee2

File tree

2 files changed: +12 -2 lines changed

2 files changed: +12 -2 lines changed

main.py

+1 -1 (1 addition, 1 deletion)
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ def createArgParser():
7676
parser.add_argument('-o', '--offset',
7777
help='Set start offset for current processed (for batched import process)')
7878
parser.add_argument('-r', '--singleRecord',
79-
help='Single record ID for ingesting an individual record (only applicable for DOAB)')
79+
help='Single record ID for ingesting an individual record')
8080
parser.add_argument('options', nargs='*', help='Additional arguments')
8181

8282
return parser

processes/cluster.py

+11 -1 (11 additions, 1 deletion)
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,8 @@ def runProcess(self):
4040
self.cluster_records(full=True)
4141
elif self.process == 'custom':
4242
self.cluster_records(start_datetime=self.ingestPeriod)
43+
elif self.process == 'single':
44+
self.cluster_records(record_uuid=self.singleRecord)
4345
else:
4446
logger.warning(f'Unknown cluster process type {self.process}')
4547
except Exception as e:
@@ -48,7 +50,7 @@ def runProcess(self):
4850
finally:
4951
self.close_connection()
5052

51-
def cluster_records(self, full=False, start_datetime=None):
53+
def cluster_records(self, full=False, start_datetime=None, record_uuid=None):
5254
get_unclustered_records_query = (
5355
self.session.query(Record)
5456
.filter(Record.frbr_status == 'complete')
@@ -65,6 +67,9 @@ def cluster_records(self, full=False, start_datetime=None):
6567

6668
get_unclustered_records_query = get_unclustered_records_query.filter(Record.date_modified > start_datetime)
6769

70+
if record_uuid:
71+
get_unclustered_records_query = get_unclustered_records_query.filter(Record.uuid == record_uuid)
72+
6873
works_to_index = []
6974
work_ids_to_delete = set()
7075

@@ -167,6 +172,11 @@ def find_all_matching_records(self, record: Record):
167172

168173
for matched_record in matched_records:
169174
matched_record_title, matched_record_id, matched_record_identifiers = matched_record
175+
176+
if not matched_record_title:
177+
logger.warning(f'Matched record with id {matched_record_id} has no title')
178+
continue
179+
170180
tokenized_matched_record_title = self.tokenize_title(matched_record_title)
171181

172182
if match_distance > 0 and not self.titles_overlap(tokenized_record_title, tokenized_matched_record_title):

0 commit comments

Comments
 (0)