9
9
from logger import create_log
10
10
from mappings .publisher_backlist import PublisherBacklistMapping
11
11
from managers import S3Manager , WebpubManifest
12
+ from services .ssm_service import SSMService
13
+ from services .google_drive_service import GoogleDriveService
12
14
from .source_service import SourceService
13
15
from managers import DBManager , ElasticsearchManager
14
16
from elasticsearch_dsl import Search , Q
20
22
class PublisherBacklistService (SourceService ):
21
23
def __init__(self):
    """Create the service clients and load the Airtable auth token.

    Instantiates the SSM, S3, Google Drive, database and Elasticsearch
    helpers, opens the S3 client, the database engine and the
    Elasticsearch connection, and records the bucket name and the S3 key
    prefixes used for manifests and uploaded title files.

    Raises:
        KeyError: if the ``FILE_BUCKET`` or ``ENVIRONMENT`` environment
            variable is not set.
    """
    # SSM is created first because the Airtable token is read from it below.
    self.ssm_service = SSMService()

    self.s3_manager = S3Manager()
    self.s3_manager.createS3Client()
    self.file_bucket = os.environ['FILE_BUCKET']

    self.drive_service = GoogleDriveService()

    # Key prefixes inside ``file_bucket``: one for generated manifests,
    # one for the title files copied over from Drive.
    self.manifest_prefix = 'manifests/publisher_backlist'
    self.title_prefix = 'titles/publisher_backlist'

    self.db_manager = DBManager()
    self.db_manager.generateEngine()

    self.es_manager = ElasticsearchManager()
    self.es_manager.createElasticConnection()

    # Only an exact 'production' value selects the production parameter;
    # every other environment value falls back to the QA parameter.
    env_segment = (
        'production' if os.environ['ENVIRONMENT'] == 'production' else 'qa'
    )
    parameter_arn = (
        'arn:aws:ssm:us-east-1:946183545209:parameter/drb/'
        f'{env_segment}/airtable/pub-backlist/api-key'
    )
    self.airtable_auth_token = self.ssm_service.get_parameter(parameter_arn)
33
41
34
42
def delete_records (
35
43
self ,
36
44
limit : Optional [int ]= None
37
45
):
38
46
filter_by_formula = self .build_filter_by_formula_parameter (deleted = True )
39
-
47
+
40
48
array_json_records = self .get_records_array (limit , filter_by_formula )
41
49
42
50
for json_dict in array_json_records :
@@ -46,14 +54,14 @@ def delete_records(
46
54
record_metadata_dict = records_value ['fields' ]
47
55
self .delete_manifest (self .db_manager , record_metadata_dict )
48
56
self .delete_work (record_metadata_dict )
49
-
57
+
50
58
def delete_manifest (self , record_metadata_dict ):
51
59
self .db_manager .createSession ()
52
60
try :
53
61
record = self .db_manager .session .query (Record ).filter (Record .source_id == record_metadata_dict ['DRB Record_ID' ]).first ()
54
62
if record :
55
63
key_name = self .get_metadata_file_name (record , record_metadata_dict )
56
- self .s3_manager .s3Client .delete_object (Bucket = self .s3_bucket , Key = key_name )
64
+ self .s3_manager .s3Client .delete_object (Bucket = self .file_bucket , Key = key_name )
57
65
except Exception :
58
66
logger .exception (f'Failed to delete manifest for record: { record .source_id } ' )
59
67
finally :
@@ -94,7 +102,7 @@ def delete_pub_backlist_edition_only(self, record_uuid_str, work):
94
102
edition_es_response .delete ()
95
103
96
104
def get_metadata_file_name (self , record , record_metadata_dict ):
97
- key_format = f"{ self .prefix } { record .source } "
105
+ key_format = f"{ self .manifest_prefix } { record .source } "
98
106
99
107
if record_metadata_dict ['File ID 1' ]:
100
108
file_title = record_metadata_dict ['File ID 1' ]
@@ -121,9 +129,22 @@ def get_records(
121
129
for records_value in json_dict ['records' ]:
122
130
try :
123
131
record_metadata_dict = records_value ['fields' ]
132
+ file_id = f'{ self .drive_service .id_from_url (record_metadata_dict ["DRB_File Location" ])} '
133
+ file_name = self .drive_service .get_file_metadata (file_id ).get ("name" )
134
+ file = self .drive_service .get_drive_file (file_id )
135
+ if not file :
136
+ logger .warn (f'Could not retrieve file for { record_metadata_dict ["id" ]} from Drive, skipping' )
137
+ continue
138
+ bucket = self .file_bucket # TODO: if record is limited access, upload to limited access bucket
139
+ s3_path = f'{ self .title_prefix } /{ record_metadata_dict ["Publisher (from Projects)" ][0 ]} /{ file_name } '
140
+ s3_response = self .s3_manager .putObjectInBucket (file .getvalue (), s3_path , bucket )
141
+ if not s3_response .get ('ResponseMetadata' ).get ('HTTPStatusCode' ) == 200 :
142
+ logger .warn (f'Could not upload file for { record_metadata_dict ["id" ]} to s3, skipping' )
143
+ continue
144
+ s3_url = f'https://{ bucket } .s3.amazonaws.com/{ s3_path } '
124
145
pub_backlist_record = PublisherBacklistMapping (record_metadata_dict )
125
146
pub_backlist_record .applyMapping ()
126
- self .add_has_part_mapping (pub_backlist_record .record )
147
+ self .add_has_part_mapping (pub_backlist_record .record , s3_url )
127
148
self .store_pdf_manifest (pub_backlist_record .record )
128
149
complete_records .append (pub_backlist_record )
129
150
except Exception :
@@ -138,11 +159,8 @@ def get_records_json(self,
138
159
) -> list [dict ]:
139
160
if offset == None :
140
161
limit = 100
141
-
142
- limit = offset
143
-
144
- filter_by_formula = self .build_filter_by_formula_parameter (deleted = False , full_import = None , start_timestamp = None )
145
-
162
+
163
+ filter_by_formula = self .build_filter_by_formula_parameter (deleted = False , full_import = full_import , start_timestamp = start_timestamp )
146
164
array_json_records = self .get_records_array (limit , filter_by_formula )
147
165
148
166
return array_json_records
@@ -194,15 +212,12 @@ def get_records_array(self,
194
212
195
213
return array_json
196
214
197
- def add_has_part_mapping (self , record ):
198
-
199
- #GOOGLE DRIVE API CALL TO GET PDF/EPUB FILES
200
-
215
+ def add_has_part_mapping (self , record , s3_url ):
201
216
try :
202
217
if 'in_copyright' in record .rights :
203
218
link_string = '|' .join ([
204
219
'1' ,
205
- #LINK TO PDF/EPUB ,
220
+ s3_url ,
206
221
record .source ,
207
222
'application/pdf' ,
208
223
'{"catalog": false, "download": true, "reader": false, "embed": false, "nypl_login": true}'
@@ -212,7 +227,7 @@ def add_has_part_mapping(self, record):
212
227
if 'public_domain' in record .rights :
213
228
link_string = '|' .join ([
214
229
'1' ,
215
- #LINK TO PDF/EPUB ,
230
+ s3_url ,
216
231
record .source ,
217
232
'application/pdf' ,
218
233
'{"catalog": false, "download": true, "reader": false, "embed": false}'
@@ -228,14 +243,14 @@ def store_pdf_manifest(self, record):
228
243
229
244
if media_type == 'application/pdf' :
230
245
record_id = record .identifiers [0 ].split ('|' )[0 ]
231
- manifest_path = f'{ self .prefix } /{ source } /{ record_id } .json'
246
+ manifest_path = f'{ self .manifest_prefix } /{ source } /{ record_id } .json'
232
247
manifest_url = 'https://{}.s3.amazonaws.com/{}' .format (
233
- self .s3_bucket , manifest_path
248
+ self .file_bucket , manifest_path
234
249
)
235
250
236
251
manifest_json = self .generate_manifest (record , url , manifest_url )
237
252
238
- self .s3_manager .createManifestInS3 (manifest_path , manifest_json , self .s3_bucket )
253
+ self .s3_manager .createManifestInS3 (manifest_path , manifest_json , self .file_bucket )
239
254
240
255
if 'in_copyright' in record .rights :
241
256
link_string = '|' .join ([
0 commit comments