Skip to content

Commit aaca9fa

Browse files
committed
Compare an accruals location to an aip store
The accruals->aips functionality introduces a digital object that can compare itself to other objects of the same class. Where file paths, checksums, and dates all match, the objects are considered identical. Where they don't, users can use the modulo operator (%) to identify where the objects are not in fact identical. Much of the benefit of this work derives from the AIP structure imposed on a digital transfer. That, however, might make 'true'-duplicate matching slightly rarer than simply identifying checksum duplicates.
1 parent a0d4eee commit aaca9fa

8 files changed

+451
-49
lines changed

reports/duplicates/accruals.py

+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
from __future__ import print_function, unicode_literals
5+
6+
import copy
7+
import logging
8+
import os
9+
import sys
10+
11+
# Prefer package-relative imports; fall back to flat imports when this file is
# executed directly as a script. A relative import outside a package raises
# ValueError on Python 2 but ImportError on Python 3, so both are caught.
try:
    from .appconfig import AppConfig
    from .digital_object import DigitalObject
    from . import duplicates
    from . import loggingconfig
    from .serialize_to_csv import CSVOut
    from . import utils
except (ImportError, ValueError):
    from appconfig import AppConfig
    from digital_object import DigitalObject
    import duplicates
    import loggingconfig
    from serialize_to_csv import CSVOut
    import utils
25+
26+
# Directory containing this module. The script entry point writes its
# "report.log" file here.
logging_dir = os.path.dirname(os.path.abspath(__file__))

# Module logger; explicitly left enabled.
logger = logging.getLogger("accruals")
logger.disabled = False

# Location purpose = Transfer Source (TS). main() only considers storage
# locations with this purpose.
location_purpose = "TS"
# Description of the transfer source used when none is given on the command
# line.
default_location = "accruals"

# When true, stat_transfers() maps every transfer path through
# utils.get_docker_path() before walking it.
# TODO: hard-coded for now; make this configurable.
DOCKER = True

# Store our appraisal paths.
# NOTE(review): not referenced anywhere in the visible part of this module.
accrual_paths = []
41+
42+
43+
def create_manifest(aip_index, accrual_objs):
    """Return copies of the accrual objects that equal an object in the index.

    The manifest data is read from ``aip_index`` under the key
    ``duplicates.MANIFEST_DATA`` and is treated as a mapping of hash to a
    collection of indexed objects. For each accrual object, the first of its
    hashes found in that mapping triggers a scan of *every* indexed object;
    each one that compares equal (``==``) to the accrual object yields a
    shallow copy of the accrual object carrying the indexed object's
    ``package_name``.

    :param aip_index: mapping holding the AIP manifest data
    :param accrual_objs: iterable of objects exposing ``hashes``
    :returns: list of copied accrual objects, one per equal indexed object
    """
    indexed = aip_index.get(duplicates.MANIFEST_DATA)
    matches = []
    for candidate in accrual_objs:
        for digest in candidate.hashes:
            if digest not in indexed.keys():
                continue
            for stored_items in indexed.values():
                for stored in stored_items:
                    if candidate == stored:
                        clone = copy.copy(candidate)
                        clone.package_name = stored.package_name
                        matches.append(clone)
            # One known hash is enough to trigger the scan, so stop looking
            # at this object's remaining hashes. Possibly redundant, as the
            # bag manifests only provide a single hash.
            break
    return matches
61+
62+
63+
def create_comparison_obj(transfer_path):
    """Build a DigitalObject for every regular file beneath ``transfer_path``.

    The tree is walked top-down with ``os.walk``. Directory entries that are
    not regular files according to ``os.path.isfile`` are skipped.

    :param transfer_path: root directory of a single transfer
    :returns: list of DigitalObject instances in walk order; empty when the
        directory holds no files
    """
    candidates = (
        os.path.join(folder, filename)
        for folder, _, filenames in os.walk(transfer_path, topdown=True)
        for filename in filenames
    )
    return [
        DigitalObject(candidate, transfer_path)
        for candidate in candidates
        if os.path.isfile(candidate)
    ]
72+
73+
74+
def stat_transfers(accruals_path, all_transfers):
    """Compare every transfer under ``accruals_path`` with the AIP index.

    The AIP index is retrieved once. Each transfer directory is then turned
    into a list of digital objects and matched against the index with
    ``create_manifest``. Finally the collected data is handed to ``CSVOut``
    for output.

    :param accruals_path: path of the transfer source location
    :param all_transfers: iterable of transfer directory names found
        directly under ``accruals_path``
    """
    aip_index = duplicates.retrieve_aip_index()
    per_transfer_objects = []
    per_transfer_reports = []
    for transfer in all_transfers:
        joined = os.path.join(accruals_path, transfer)
        # Translate the path via the docker helper when DOCKER is set.
        transfer_home = utils.get_docker_path(joined) if DOCKER else joined
        transfer_objects = create_comparison_obj(transfer_home)
        per_transfer_objects.append(transfer_objects)
        manifest = create_manifest(aip_index, transfer_objects)
        per_transfer_reports.append({transfer: manifest})
    CSVOut.stat_manifests(aip_index, per_transfer_objects)
    CSVOut.csv_out(per_transfer_reports, "")
90+
91+
92+
def main(location=default_location):
    """Primary entry point for this script.

    Looks through the storage locations reported by the client for one whose
    purpose equals ``location_purpose`` and whose description equals
    ``location``. If several match, the last one wins. When none matches, a
    message is logged and the process exits via ``sys.exit()``. Otherwise
    the transferable directories are listed and passed to
    ``stat_transfers``.

    :param location: description of the transfer source to inspect
    """
    am = AppConfig().get_am_client()
    storage_locations = am.list_storage_locations()

    found = False
    for candidate in storage_locations.get("objects"):
        if candidate.get("purpose") != location_purpose:
            continue
        if candidate.get("description") != location:
            continue
        # Point the client at the matching transfer source.
        am.transfer_source = candidate.get("uuid")
        am.transfer_path = candidate.get("path")
        found = True

    if not found:
        logger.info("Exiting. No transfer source: {}".format(location))
        sys.exit()

    # All transfer directories. Assumption is the same as Archivematica that
    # each transfer is organized into a single directory at this level.
    all_transfers = am.transferables().get("directories")
    stat_transfers(am.transfer_path, all_transfers)
116+
117+
118+
if __name__ == "__main__":
    # Log at INFO level to "report.log" next to this module.
    loggingconfig.setup("INFO", os.path.join(logging_dir, "report.log"))
    # An optional first command line argument names the transfer source to
    # inspect; without it the default location is used.
    if len(sys.argv) > 1:
        source = sys.argv[1]
        # Informational only, so it is not logged as an error.
        logger.info("Attempting to find transfers at: %s", source)
    else:
        source = default_location
    sys.exit(main(source))

reports/duplicates/digital_object.py

+122
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
"""Digital object class to help with matching."""
5+
6+
import json
7+
import os
8+
import time
9+
10+
try:
11+
from . import hashutils
12+
except ImportError:
13+
import hashutils
14+
15+
16+
class DigitalObjectException(Exception):
    """Raised when a DigitalObject cannot be built from its inputs, e.g. when
    a path is supplied without a transfer path.
    """
18+
19+
20+
class DigitalObject(object):
    """Metadata about a single file, used to match accruals against AIPs.

    Two objects are equal (``==``) when they share at least one hash and have
    the same ``filepath`` and ``date_modified``. The modulo operator (``%``)
    returns ``0`` for equal objects and otherwise a short description of the
    components that do match.

    NOTE: because ``__eq__`` is defined without ``__hash__``, instances are
    unhashable on Python 3.
    """

    # Object members.
    basename = None
    date_modified = None
    dirname = None
    filepath = None
    hashes = None
    package_uuid = None
    package_name = None

    def __init__(self, path=None, transfer_path=None):
        """Populate the digital object metadata. If we don't supply a path
        we'll just return an empty object to be populated on our own terms.

        :param path: path of the file on disk (optional)
        :param transfer_path: root of the transfer that contains ``path``;
            required whenever ``path`` is given
        :raises DigitalObjectException: if ``path`` is given without
            ``transfer_path``
        """
        if not path:
            self.basename = None
            self.date_modified = None
            self.dirname = None
            self.filepath = None
            self.hashes = []
            self.package_uuid = None
            self.package_name = None
            return

        if not transfer_path:
            raise DigitalObjectException("Transfer path isn't set")
        # Construct path as if it is in a Bag object.
        comparison_path = self._bag_path(path, transfer_path)
        self.filepath = comparison_path
        self.set_basename(comparison_path)
        self.set_dirname(comparison_path)
        # presumably a mapping keyed by checksum value -- verify against
        # hashutils.hash.
        self.hashes = hashutils.hash(path)
        self.date_modified = self.get_timestamp(path)

    @staticmethod
    def _bag_path(path, transfer_path):
        """Rewrite ``path`` so that it sits under ``data/objects``.

        Only the leading ``transfer_path`` is replaced. A later repetition of
        the same text inside the path is left alone, and a trailing separator
        on ``transfer_path`` no longer fuses ``objects`` with the next path
        component.
        """
        bag_root = os.path.join("data", "objects")
        if not path.startswith(transfer_path):
            # Not located under the transfer path: keep the historic
            # substitution, but never rewrite more than one occurrence.
            return path.replace(transfer_path, bag_root, 1)
        separators = os.sep + (os.altsep or "")
        remainder = path[len(transfer_path):].lstrip(separators)
        if not remainder:
            return bag_root
        return bag_root + os.sep + remainder

    def set_basename(self, path):
        """Store the final component of ``path`` as ``basename``."""
        self.basename = os.path.basename(path)

    def set_dirname(self, path):
        """Store the directory part of ``path`` as ``dirname``."""
        self.dirname = os.path.dirname(path)

    def as_dict(self):
        """Return the instance attribute dictionary (the live ``__dict__``,
        not a copy).
        """
        return self.__dict__

    def __str__(self):
        """Return the instance attributes as indented, key-sorted JSON."""
        return json.dumps(
            self.__dict__, sort_keys=True, indent=4, separators=(",", ": ")
        )

    def __eq__(self, other):
        """Comparison operator for the digital object class. If two hashes
        match, and the given file path and modification date, we will return
        True.

        Returns ``NotImplemented`` when ``other`` does not expose the
        attributes needed for the comparison.
        """
        try:
            other_hashes = other.hashes
            other_filepath = other.filepath
            other_date = other.date_modified
        except AttributeError:
            return NotImplemented
        if self.filepath != other_filepath:
            return False
        if self.date_modified != other_date:
            return False
        # Plain iteration and membership work for a mapping of hashes as well
        # as for the empty-list default set by the constructor.
        theirs = other_hashes or ()
        return any(own in theirs for own in (self.hashes or ()))

    def __ne__(self, other):
        """Inverse of ``__eq__``. Python 2 does not derive ``!=`` from
        ``==``, so it is spelled out here.
        """
        outcome = self.__eq__(other)
        if outcome is NotImplemented:
            return outcome
        return not outcome

    def __mod__(self, other):
        """Modulo operator for the digital object class. If two hashes match,
        and the given file-path, then return zero. If there is any partial
        match, then return basis information. % is potentially useful for
        debugging, or enhanced reporting.
        """
        outcome = self.__eq__(other)
        if outcome is NotImplemented:
            return NotImplemented
        if outcome:
            return 0
        # Not equal: report which individual components do match.
        checks = (
            ("date modified match", self.date_modified == other.date_modified),
            ("filename match", self.basename == other.basename),
            ("directory name match", self.dirname == other.dirname),
        )
        ret = ""
        for msg, matched in checks:
            if matched:
                ret = self.__concat_basis__(ret, msg)
        if not ret:
            return "No matching components"
        return ret

    @staticmethod
    def __concat_basis__(ret, msg):
        """Helper function to bring basis information together usefully."""
        if ret:
            return "{}; {}".format(ret, msg)
        return msg

    @staticmethod
    def get_timestamp(path):
        """Return the file's modification date as ``YYYY-MM-DD`` in local
        time.
        """
        return time.strftime("%Y-%m-%d", time.localtime(os.path.getmtime(path)))

0 commit comments

Comments
 (0)