Skip to content

Commit aa8c4c3

Browse files
committed
Compare an accruals location to an aip store
This commit introduces an accruals->aips comparison capability. Digital objects in an accruals folder can now be compared to the contents of an AIP store. Where filepaths, checksums, and dates all match, the object is considered to be identical (a true duplicate). Where they don't, users can use modulo (%) to identify which components of the object do and do not match. Much of the benefit of this work is derived from the nature of the AIP structure imposed on a digital transfer. Once the comparison is complete, three reports are output in CSV format: * True-duplicates. * Near-duplicates (checksums match, but other components might not). * Non-duplicates. Additionally, a summary report is output in JSON.
1 parent a0d4eee commit aa8c4c3

11 files changed

+568
-151
lines changed

reports/duplicates/accruals.py

+161
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
from __future__ import print_function, unicode_literals
5+
6+
import copy
7+
import logging
8+
import os
9+
import sys
10+
11+
try:
12+
from .appconfig import AppConfig
13+
from .digital_object import DigitalObject
14+
from . import duplicates
15+
from . import loggingconfig
16+
from .serialize_to_csv import CSVOut
17+
from . import utils
18+
except (ValueError, ImportError):
19+
from appconfig import AppConfig
20+
from digital_object import DigitalObject
21+
import duplicates
22+
import loggingconfig
23+
from serialize_to_csv import CSVOut
24+
import utils
25+
26+
# Directory holding this module; the entry point writes "report.log" here.
logging_dir = os.path.dirname(os.path.abspath(__file__))

logger = logging.getLogger("accruals")
logger.disabled = False

# Purpose code of the storage locations we search: Transfer Source (TS).
location_purpose = "TS"
# Description of the transfer source to use when the caller supplies none.
default_location = AppConfig().accruals_transfer_source

# NOTE(review): hard-coded switch. While True, every transfer path is passed
# through utils.get_docker_path() before it is walked.
DOCKER = True

# Store our appraisal paths.
# NOTE(review): nothing visible in this module reads this list - confirm use.
accrual_paths = []
41+
42+
43+
def create_manifest(aip_index, accrual_objs):
    """Classify accrual objects as true, near, or non duplicates of AIP items.

    :param aip_index: mapping whose ``duplicates.MANIFEST_DATA`` entry maps
        hashes to lists of indexed AIP objects.
    :param accrual_objs: objects built from the files of one transfer. Each
        object's ``flag`` is set to True once any match is recorded for it.
    :returns: a three-tuple of lists:
        shallow copies of accrual objects equal to an AIP item (each copy
        carries that item's ``package_name``); ``[accrual, aip_item]`` copy
        pairs whose modulo comparison reports a checksum match; and copies
        of accrual objects that were never flagged.
    """
    true_duplicates = []
    near_duplicates = []
    indexed_items = aip_index.get(duplicates.MANIFEST_DATA)
    for candidate in accrual_objs:
        for digest in candidate.hashes:
            if digest not in indexed_items.keys():
                continue
            # The hash is known to the index, so compare the candidate with
            # every indexed item and keep whatever matches fully or in part.
            for stored_items in indexed_items.values():
                for stored in stored_items:
                    if candidate == stored:
                        candidate.flag = True
                        match = copy.copy(candidate)
                        match.package_name = stored.package_name
                        true_duplicates.append(match)
                        continue
                    basis = candidate % stored
                    if (
                        basis != "No matching components"
                        and "checksum match" in basis
                    ):
                        candidate.flag = True
                        near_duplicates.append(
                            [copy.copy(candidate), copy.copy(stored)]
                        )
            # Only need one hash to match then stop looking at further hashes.
            # May also be redundant as we only have one hash from the bag
            # manifests...
            break
    unmatched = []
    for candidate in accrual_objs:
        if candidate.flag is not False:
            continue
        snapshot = copy.copy(candidate)
        if snapshot not in unmatched:
            unmatched.append(snapshot)
    return true_duplicates, near_duplicates, unmatched
81+
82+
83+
def create_comparison_obj(transfer_path):
    """Build one DigitalObject per regular file found beneath transfer_path.

    :param transfer_path: root directory of a single transfer.
    :returns: list of DigitalObject instances in ``os.walk`` order; empty
        when the directory holds no files.
    """
    candidate_paths = (
        os.path.join(folder, filename)
        for folder, _, filenames in os.walk(transfer_path, topdown=True)
        for filename in filenames
    )
    return [
        DigitalObject(candidate, transfer_path)
        for candidate in candidate_paths
        if os.path.isfile(candidate)
    ]
92+
93+
94+
def stat_transfers(accruals_path, all_transfers):
    """Compare every transfer against the AIP index and write the reports.

    :param accruals_path: path of the transfer source containing the
        transfers.
    :param all_transfers: names of the transfer directories beneath
        ``accruals_path``.

    The summary is always handed to ``CSVOut.stat_manifests``; each of the
    duplicate, near-duplicate and no-match CSV writers is only called when
    at least one transfer produced entries of that kind.
    """
    index = duplicates.retrieve_aip_index()
    dupe_reports, near_reports, no_match_reports = [], [], []
    compared = []
    for transfer in all_transfers:
        home = os.path.join(accruals_path, transfer)
        home = utils.get_docker_path(home) if DOCKER else home
        objects = create_comparison_obj(home)
        compared.append(objects)
        matched, near, unmatched = create_manifest(index, objects)
        for collected, manifest in (
            (dupe_reports, matched),
            (near_reports, near),
            (no_match_reports, unmatched),
        ):
            if manifest:
                collected.append({transfer: manifest})
    CSVOut.stat_manifests(index, compared)
    if dupe_reports:
        CSVOut.dupe_csv_out(dupe_reports, "")
    if near_reports:
        CSVOut.near_csv_out(near_reports, "")
    if no_match_reports:
        CSVOut.no_match_csv_out(no_match_reports, "")
125+
126+
127+
def main(location=default_location):
    """Locate the accruals transfer source and compare it to the AIP store.

    :param location: description of the transfer source to look for among
        the storage locations whose purpose is ``location_purpose``.

    Exits the interpreter via ``sys.exit()`` when no such source exists.
    """
    am = AppConfig().get_am_client()
    storage_locations = am.list_storage_locations()

    found = False
    for candidate in storage_locations.get("objects"):
        if candidate.get("purpose") != location_purpose:
            continue
        if candidate.get("description") != location:
            continue
        # No early exit: when several sources qualify the last one wins.
        am.transfer_source = candidate.get("uuid")
        am.transfer_path = candidate.get("path")
        found = True
    if not found:
        logger.info("Exiting. No transfer source: {}".format(location))
        sys.exit()

    # All transfer directories. Assumption is the same as Archivematica that
    # each transfer is organized into a single directory at this level.
    transfer_dirs = am.transferables().get("directories")
    stat_transfers(am.transfer_path, transfer_dirs)
151+
152+
153+
if __name__ == "__main__":
    loggingconfig.setup("INFO", os.path.join(logging_dir, "report.log"))
    # The first command-line argument, when present, overrides the configured
    # default transfer source description.
    if len(sys.argv) > 1:
        source = sys.argv[1]
        # Informational message: logged at INFO rather than ERROR so that a
        # normal run does not look like a failure in the report log.
        logger.info("Attempting to find transfers at: %s", source)
    else:
        source = default_location
    sys.exit(main(source))

reports/duplicates/appconfig.py

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ def _load_config(self, config_file):
1919
self.storage_service_user = conf.get("storage_service_user")
2020
self.storage_service_api_key = conf.get("storage_service_api_key")
2121
self.storage_service_url = conf.get("storage_service_url")
22+
self.accruals_transfer_source = conf.get("accruals_transfer_source")
2223

2324
def get_am_client(self):
2425
"""Return an Archivematica API client to the caller."""

reports/duplicates/config.json

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
{
2-
"storage_service_url": "http://127.0.0.1:62081",
3-
"storage_service_user": "test",
4-
"storage_service_api_key": "test"
2+
"storage_service_url": "http://127.0.0.1:62081",
3+
"storage_service_user": "test",
4+
"storage_service_api_key": "test",
5+
"accruals_transfer_source": "accruals"
56
}

reports/duplicates/digital_object.py

+129
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
"""Digital object class to help with matching."""
5+
6+
import json
7+
import os
8+
import time
9+
10+
try:
11+
from . import hashutils
12+
except (ValueError, ImportError):
13+
import hashutils
14+
15+
16+
class DigitalObjectException(Exception):
    """Signals that a digital object could not be built from its inputs."""
18+
19+
20+
class DigitalObject(object):
    """Metadata about a single file, used to compare accruals with AIPs.

    Two objects are equal (``==``) when they share at least one hash and
    have the same ``filepath`` and ``date_modified``. The modulo operator
    (``%``) describes which components of two objects match.

    Defining ``__eq__`` without ``__hash__`` leaves instances unhashable on
    Python 3.
    """

    # Object members.
    basename = None
    date_modified = None
    dirname = None
    filepath = None
    hashes = None
    package_uuid = None
    package_name = None

    def __init__(self, path=None, transfer_path=None):
        """Populate the digital object metadata.

        :param path: file to describe. When omitted an empty object is
            created for the caller to populate on their own terms.
        :param transfer_path: root of the transfer containing ``path``;
            required whenever ``path`` is given.
        :raises DigitalObjectException: if ``path`` is given without a
            ``transfer_path``.
        """
        if not path:
            self.basename = None
            self.date_modified = None
            self.dirname = None
            self.filepath = None
            self.hashes = []
            self.package_uuid = None
            self.package_name = None
            self.flag = False
            return

        if not transfer_path:
            raise DigitalObjectException("Transfer path isn't set")
        # Construct path as if it is in a Bag object. Only the first
        # occurrence is rewritten so that a directory structure repeating the
        # transfer path deeper in the tree is left intact.
        comparison_path = path.replace(
            transfer_path, os.path.join("data", "objects"), 1
        )
        self.filepath = comparison_path
        self.set_basename(comparison_path)
        self.set_dirname(comparison_path)
        self.hashes = hashutils.hash(path)
        self.date_modified = self.get_timestamp(path)
        self.flag = False

    def set_basename(self, path):
        """Store the final component of ``path`` as the object's basename."""
        self.basename = os.path.basename(path)

    def set_dirname(self, path):
        """Store the directory portion of ``path`` as the object's dirname."""
        self.dirname = os.path.dirname(path)

    def as_dict(self):
        """Return the instance attribute dictionary (not a copy)."""
        return self.__dict__

    def __str__(self):
        """Render the instance attributes as indented, key-sorted JSON."""
        return json.dumps(
            self.__dict__, sort_keys=True, indent=4, separators=(",", ": ")
        )

    def _shares_hash_with(self, other):
        """Return True if any of our hashes also appears in ``other``'s.

        Works whether ``hashes`` is a dict keyed by hash value, a list of
        hash values, or unset (None or empty).
        """
        other_hashes = other.hashes or ()
        return any(key in other_hashes for key in (self.hashes or ()))

    @staticmethod
    def _is_comparable(other):
        """Return True if ``other`` carries the attributes equality reads."""
        return all(
            hasattr(other, name)
            for name in ("hashes", "filepath", "date_modified")
        )

    def __eq__(self, other):
        """Comparison operator for the digital object class.

        Returns True when the objects share a hash and both the file path
        and the modification date match. Objects lacking the compared
        attributes yield ``NotImplemented`` so Python can fall back to its
        default comparison instead of raising.
        """
        if not self._is_comparable(other):
            return NotImplemented
        return (
            self._shares_hash_with(other)
            and self.filepath == other.filepath
            and self.date_modified == other.date_modified
        )

    def __ne__(self, other):
        """Inverse of ``__eq__``; spelled out so ``!=`` agrees on Python 2."""
        outcome = self.__eq__(other)
        if outcome is NotImplemented:
            return outcome
        return not outcome

    def __mod__(self, other):
        """Modulo operator for the digital object class.

        Returns zero when the two objects are equal. Otherwise returns a
        "; "-separated string naming each matching component (checksum, date
        modified, filename, directory name), or "No matching components".
        % is potentially useful for debugging, or enhanced reporting.
        """
        if self == other:
            return 0
        basis = ""
        if self._shares_hash_with(other):
            basis = self.__concat_basis__(basis, "checksum match")
        if self.date_modified == other.date_modified:
            basis = self.__concat_basis__(basis, "date modified match")
        if self.basename == other.basename:
            basis = self.__concat_basis__(basis, "filename match")
        if self.dirname == other.dirname:
            basis = self.__concat_basis__(basis, "directory name match")
        if not basis:
            return "No matching components"
        return basis

    @staticmethod
    def __concat_basis__(ret, msg):
        """Helper function to bring basis information together usefully."""
        if ret:
            return "{}; {}".format(ret, msg)
        return msg

    @staticmethod
    def get_timestamp(path):
        """Return the file's modification date as a local YYYY-MM-DD string."""
        return time.strftime("%Y-%m-%d", time.localtime(os.path.getmtime(path)))

0 commit comments

Comments
 (0)