SFR-2292: Skip clustering records with no title (#423)

kylevillegas93 · web-flow · commit 66faee28ed4d · 2024-10-30T09:11:03.000-04:00
diff --git a/main.py b/main.py
@@ -76,7 +76,7 @@ def createArgParser():
     parser.add_argument('-o', '--offset',
                         help='Set start offset for current processed (for batched import process)')
     parser.add_argument('-r', '--singleRecord',
-                        help='Single record ID for ingesting an individual record (only applicable for DOAB)')
+                        help='Single record ID for ingesting an individual record')
     parser.add_argument('options', nargs='*', help='Additional arguments')
 
     return parser
diff --git a/processes/cluster.py b/processes/cluster.py
@@ -40,6 +40,8 @@ def runProcess(self):
                 self.cluster_records(full=True)
             elif self.process == 'custom':
                 self.cluster_records(start_datetime=self.ingestPeriod)
+            elif self.process == 'single':
+                self.cluster_records(record_uuid=self.singleRecord)
             else: 
                 logger.warning(f'Unknown cluster process type {self.process}')
         except Exception as e:
@@ -48,7 +50,7 @@ def runProcess(self):
         finally:
             self.close_connection()
 
-    def cluster_records(self, full=False, start_datetime=None):
+    def cluster_records(self, full=False, start_datetime=None, record_uuid=None):
         get_unclustered_records_query = (
             self.session.query(Record)
                 .filter(Record.frbr_status == 'complete')
@@ -65,6 +67,9 @@ def cluster_records(self, full=False, start_datetime=None):
 
             get_unclustered_records_query = get_unclustered_records_query.filter(Record.date_modified > start_datetime)
 
+        if record_uuid:
+            get_unclustered_records_query = get_unclustered_records_query.filter(Record.uuid == record_uuid)
+
         works_to_index = []
         work_ids_to_delete = set()
 
@@ -167,6 +172,11 @@ def find_all_matching_records(self, record: Record):
 
             for matched_record in matched_records:
                 matched_record_title, matched_record_id, matched_record_identifiers = matched_record
+                
+                if not matched_record_title:
+                    logger.warning(f'Matched record with id {matched_record_id} has no title')
+                    continue
+
                 tokenized_matched_record_title = self.tokenize_title(matched_record_title)
 
                 if match_distance > 0 and not self.titles_overlap(tokenized_record_title, tokenized_matched_record_title):