Skip to content

Commit 3814252

Browse files
authored
SFR-2390: Download files from Drive and upload them to S3 as part of the publishers project process (#492)
Grab Drive files and upload them to S3
1 parent 624c229 commit 3814252

File tree

2 files changed

+40
-25
lines changed

2 files changed

+40
-25
lines changed

managers/s3.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,8 @@ def putObjectInBucket(
8080
ContentType=objectType,
8181
Metadata={'md5Checksum': objMD5}
8282
)
83-
except ClientError:
84-
raise S3Error('Unable to store file in s3')
83+
except ClientError as e:
84+
raise S3Error(f'Unable to store file {objKey} in s3: {e}')
8585

8686
def putExplodedEpubComponentsInBucket(self, obj, objKey, bucket):
8787
keyRoot = '.'.join(objKey.split('.')[:-1])

services/sources/publisher_backlist_service.py

+38-23
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
from logger import create_log
1010
from mappings.publisher_backlist import PublisherBacklistMapping
1111
from managers import S3Manager, WebpubManifest
12+
from services.ssm_service import SSMService
13+
from services.google_drive_service import GoogleDriveService
1214
from .source_service import SourceService
1315
from managers import DBManager, ElasticsearchManager
1416
from elasticsearch_dsl import Search, Q
@@ -20,23 +22,29 @@
2022
class PublisherBacklistService(SourceService):
2123
def __init__(self):
2224

25+
self.ssm_service = SSMService()
2326
self.s3_manager = S3Manager()
2427
self.s3_manager.createS3Client()
25-
self.s3_bucket = os.environ['FILE_BUCKET']
26-
self.prefix = 'manifests/publisher_backlist'
28+
self.file_bucket = os.environ['FILE_BUCKET']
29+
self.drive_service = GoogleDriveService()
30+
self.manifest_prefix = 'manifests/publisher_backlist'
31+
self.title_prefix = 'titles/publisher_backlist'
2732
self.db_manager = DBManager()
2833
self.db_manager.generateEngine()
2934
self.es_manager = ElasticsearchManager()
3035
self.es_manager.createElasticConnection()
31-
32-
self.airtable_auth_token = os.environ.get('AIRTABLE_KEY', None)
36+
37+
if os.environ['ENVIRONMENT'] == 'production':
38+
self.airtable_auth_token = self.ssm_service.get_parameter('arn:aws:ssm:us-east-1:946183545209:parameter/drb/production/airtable/pub-backlist/api-key')
39+
else:
40+
self.airtable_auth_token = self.ssm_service.get_parameter('arn:aws:ssm:us-east-1:946183545209:parameter/drb/qa/airtable/pub-backlist/api-key')
3341

3442
def delete_records(
3543
self,
3644
limit: Optional[int]=None
3745
):
3846
filter_by_formula = self.build_filter_by_formula_parameter(deleted=True)
39-
47+
4048
array_json_records = self.get_records_array(limit, filter_by_formula)
4149

4250
for json_dict in array_json_records:
@@ -46,14 +54,14 @@ def delete_records(
4654
record_metadata_dict = records_value['fields']
4755
self.delete_manifest(self.db_manager, record_metadata_dict)
4856
self.delete_work(record_metadata_dict)
49-
57+
5058
def delete_manifest(self, record_metadata_dict):
5159
self.db_manager.createSession()
5260
try:
5361
record = self.db_manager.session.query(Record).filter(Record.source_id == record_metadata_dict['DRB Record_ID']).first()
5462
if record:
5563
key_name = self.get_metadata_file_name(record, record_metadata_dict)
56-
self.s3_manager.s3Client.delete_object(Bucket= self.s3_bucket, Key= key_name)
64+
self.s3_manager.s3Client.delete_object(Bucket= self.file_bucket, Key= key_name)
5765
except Exception:
5866
logger.exception(f'Failed to delete manifest for record: {record.source_id}')
5967
finally:
@@ -94,7 +102,7 @@ def delete_pub_backlist_edition_only(self, record_uuid_str, work):
94102
edition_es_response.delete()
95103

96104
def get_metadata_file_name(self, record, record_metadata_dict):
97-
key_format = f"{self.prefix}{record.source}"
105+
key_format = f"{self.manifest_prefix}{record.source}"
98106

99107
if record_metadata_dict['File ID 1']:
100108
file_title = record_metadata_dict['File ID 1']
@@ -121,9 +129,22 @@ def get_records(
121129
for records_value in json_dict['records']:
122130
try:
123131
record_metadata_dict = records_value['fields']
132+
file_id = f'{self.drive_service.id_from_url(record_metadata_dict["DRB_File Location"])}'
133+
file_name = self.drive_service.get_file_metadata(file_id).get("name")
134+
file = self.drive_service.get_drive_file(file_id)
135+
if not file:
136+
logger.warn(f'Could not retrieve file for {record_metadata_dict["id"]} from Drive, skipping')
137+
continue
138+
bucket = self.file_bucket # TODO: if record is limited access, upload to limited access bucket
139+
s3_path = f'{self.title_prefix}/{record_metadata_dict["Publisher (from Projects)"][0]}/{file_name}'
140+
s3_response = self.s3_manager.putObjectInBucket(file.getvalue(), s3_path, bucket)
141+
if not s3_response.get('ResponseMetadata').get('HTTPStatusCode') == 200:
142+
logger.warn(f'Could not upload file for {record_metadata_dict["id"]} to s3, skipping')
143+
continue
144+
s3_url = f'https://{bucket}.s3.amazonaws.com/{s3_path}'
124145
pub_backlist_record = PublisherBacklistMapping(record_metadata_dict)
125146
pub_backlist_record.applyMapping()
126-
self.add_has_part_mapping(pub_backlist_record.record)
147+
self.add_has_part_mapping(pub_backlist_record.record, s3_url)
127148
self.store_pdf_manifest(pub_backlist_record.record)
128149
complete_records.append(pub_backlist_record)
129150
except Exception:
@@ -138,11 +159,8 @@ def get_records_json(self,
138159
) -> list[dict]:
139160
if offset == None:
140161
limit = 100
141-
142-
limit = offset
143-
144-
filter_by_formula = self.build_filter_by_formula_parameter(deleted=False, full_import=None, start_timestamp=None)
145-
162+
163+
filter_by_formula = self.build_filter_by_formula_parameter(deleted=False, full_import=full_import, start_timestamp=start_timestamp)
146164
array_json_records = self.get_records_array(limit, filter_by_formula)
147165

148166
return array_json_records
@@ -194,15 +212,12 @@ def get_records_array(self,
194212

195213
return array_json
196214

197-
def add_has_part_mapping(self, record):
198-
199-
#GOOGLE DRIVE API CALL TO GET PDF/EPUB FILES
200-
215+
def add_has_part_mapping(self, record, s3_url):
201216
try:
202217
if 'in_copyright' in record.rights:
203218
link_string = '|'.join([
204219
'1',
205-
#LINK TO PDF/EPUB,
220+
s3_url,
206221
record.source,
207222
'application/pdf',
208223
'{"catalog": false, "download": true, "reader": false, "embed": false, "nypl_login": true}'
@@ -212,7 +227,7 @@ def add_has_part_mapping(self, record):
212227
if 'public_domain' in record.rights:
213228
link_string = '|'.join([
214229
'1',
215-
#LINK TO PDF/EPUB,
230+
s3_url,
216231
record.source,
217232
'application/pdf',
218233
'{"catalog": false, "download": true, "reader": false, "embed": false}'
@@ -228,14 +243,14 @@ def store_pdf_manifest(self, record):
228243

229244
if media_type == 'application/pdf':
230245
record_id = record.identifiers[0].split('|')[0]
231-
manifest_path = f'{self.prefix}/{source}/{record_id}.json'
246+
manifest_path = f'{self.manifest_prefix}/{source}/{record_id}.json'
232247
manifest_url = 'https://{}.s3.amazonaws.com/{}'.format(
233-
self.s3_bucket, manifest_path
248+
self.file_bucket, manifest_path
234249
)
235250

236251
manifest_json = self.generate_manifest(record, url, manifest_url)
237252

238-
self.s3_manager.createManifestInS3(manifest_path, manifest_json, self.s3_bucket)
253+
self.s3_manager.createManifestInS3(manifest_path, manifest_json, self.file_bucket)
239254

240255
if 'in_copyright' in record.rights:
241256
link_string = '|'.join([

0 commit comments

Comments
 (0)