arches_extensions.management.commands.get_files
import csv
import logging
from pathlib import Path
from zipfile import ZipFile

from django.core.management.base import BaseCommand

from arches.app.models.models import Node, File
from arches.app.models.resource import Resource
from arches.app.models.graph import Graph
from arches.app.models.tile import Tile

from arches_extensions.utils import ArchesHelpTextFormatter

logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """Generate a list of file names for file-list nodes within the
specific resources.

    .. warning::
        This command is a work-in-progress

    Usage:

        python manage.py get_files

    Arguments:

    - `--resource`: Id for single resource instance to include (optional)
    - `--graph`: Name of graph, all instances will be included (optional)
    - `--make-csv`: Exports a CSV list of all file info, named for graph or instance (default=False)
    - `--make-archive`: Creates a zip archive of all files, named for graph or instance (default=False)
    - `--include-orphans`: Includes File objects that are no longer in tile data, but do exist in the database (default=False)

    """

    def __init__(self, *args, **kwargs):
        ## run the BaseCommand initialiser first; the original skipped it
        super().__init__(*args, **kwargs)
        ## the class docstring doubles as the command's help text, which is
        ## why its wording is left exactly as it was
        self.help = self.__doc__

    def add_arguments(self, parser):
        """Register the command-line options on ``parser``."""

        parser.formatter_class = ArchesHelpTextFormatter

        parser.add_argument("--resource")
        parser.add_argument("--graph")
        parser.add_argument("--make-csv", action="store_true")
        parser.add_argument("--make-archive", action="store_true")
        parser.add_argument("--include-orphans", action="store_true")

    def handle(self, *args, **options):
        """Collect file info per scope and optionally export a CSV and/or zip.

        A scope is a ``(label, File queryset)`` pair: the single resource
        given by ``--resource``, the graph given by ``--graph``, or (when
        neither is given) every resource graph except "Arches System
        Settings". Output files are written to the current working
        directory and are named after the scope label.
        """

        include_orphans = options["include_orphans"]
        suffix = "__withorphans" if include_orphans else ""

        ## get all relevant file objects
        scopes = []
        if options["resource"]:
            ## the original referenced an undefined name ``scope`` here
            resource_id = options["resource"]
            scopes.append((resource_id, File.objects.filter(tile__resourceinstance_id=resource_id)))
        elif options["graph"]:
            scopes.append((options["graph"], File.objects.filter(tile__resourceinstance__graph__name=options["graph"])))
        else:
            for graph in Graph.objects.filter(isresource=True).exclude(name="Arches System Settings"):
                scopes.append((graph.name, File.objects.filter(tile__resourceinstance__graph=graph)))

        ## process all scopes
        for scope, files in scopes:
            print(f"Getting files for: {scope}")
            info = self.collect_file_info(files, include_orphans=include_orphans)

            if options["make_csv"]:
                if not info:
                    print("no data to write, skipping")
                else:
                    ## leave the 'file path' column out of the CSV without
                    ## mutating the rows themselves
                    fieldnames = [key for key in info[0] if key != "file path"]
                    ## newline="" is what the csv module expects of its file object
                    with open(f"{scope}__filelist{suffix}.csv", "w", newline="", encoding="utf-8") as o:
                        writer = csv.DictWriter(o, fieldnames=fieldnames, extrasaction="ignore")
                        writer.writeheader()
                        writer.writerows(info)

            if options["make_archive"]:
                if not info:
                    print("no files to archive, skipping")
                else:
                    usefiles = File.objects.filter(fileid__in=[row["file id"] for row in info])
                    with ZipFile(f"{scope}__files{suffix}.zip", "w") as zip_file:
                        for f in usefiles:
                            with f.path.open("rb") as content:
                                ## archive entries are flat: base name only
                                zip_file.writestr(f.path.name.split("/")[-1], content.read())

    def collect_file_info(self, files, include_orphans=False):
        """Build one info row per file that is referenced from tile data.

        :param files: queryset of File objects to inspect (``.count()`` is
            called on it, so a plain list will not work).
        :param include_orphans: when True, also add rows for File objects
            attached to a tile whose data no longer lists them.
        :returns: list of dicts with the keys "resource id", "resource name",
            "node name", "file id", "file name (original)",
            "file name (actual)" and "file path", sorted by resource name.

        Files that do not exist in storage are reported and never appear in
        the result. Summary counts are printed along the way.
        """
        print(f"File objects: {files.count()}")

        ## quick check for missing files; only existing ones go in the lookup
        file_lookup = {}
        missing_total = 0
        for f in files:
            if f.path.storage.exists(f.path.name):
                file_lookup[str(f.fileid)] = f
            else:
                print("file missing:", f.path.path)
                missing_total += 1
        print(f"Missing files to be skipped: {missing_total}")

        ## create a lookup for the tiles the remaining files are attached to
        tile_lookup = {}
        for f in file_lookup.values():
            tileid = str(f.tile_id)
            if tileid not in tile_lookup:
                tile_lookup[tileid] = Tile.objects.get(pk=tileid)
        print(f"Tiles with files in them: {len(tile_lookup)}")

        def make_row(resid, res, node_name, file_id, original_name):
            ## one output row; file_id must be a key of file_lookup
            file_obj = file_lookup[file_id]
            return {
                "resource id": resid,
                "resource name": res.displayname,
                "node name": node_name,
                "file id": file_id,
                "file name (original)": original_name,
                "file name (actual)": Path(file_obj.path.name).name,
                "file path": file_obj.path.path,
            }

        ## iterate all tiles that have been identified to have files,
        ## and collect the file ids that are stored in the tile data
        ## also make lookups for resources and nodes along the way for later use
        node_lookup = {}
        res_lookup = {}
        file_info = []
        files_without_id = []
        unresolved_total = 0
        orphan_total = 0
        matched_total = 0
        for tile in tile_lookup.values():
            resid = str(tile.resourceinstance_id)
            ## explicit membership test: dict.get(key, <query>) would run the
            ## query on every iteration, cached or not
            if resid not in res_lookup:
                res_lookup[resid] = Resource.objects.get(pk=resid)
            res = res_lookup[resid]

            found_ids = []
            for nodeid, value in tile.data.items():
                if nodeid not in node_lookup:
                    node_lookup[nodeid] = Node.objects.get(pk=nodeid)
                node = node_lookup[nodeid]
                if node.datatype != "file-list":
                    continue
                ## an emptied file-list node may hold None instead of a list
                for entry in value or []:
                    file_id = str(entry.get("file_id"))
                    if file_id == "None":
                        files_without_id.append(entry)
                        continue
                    ## referenced by the tile, so never an orphan
                    found_ids.append(file_id)
                    if file_id not in file_lookup:
                        ## missing from storage or not part of ``files``
                        unresolved_total += 1
                        continue
                    matched_total += 1
                    file_info.append(make_row(resid, res, node.name, file_id, entry.get("name", "<unknown>")))

            orphans = File.objects.filter(tile=tile).exclude(fileid__in=found_ids)
            orphan_total += orphans.count()
            if include_orphans:
                for orphan in orphans:
                    file_id = str(orphan.pk)
                    if file_id not in file_lookup:
                        ## skipped earlier (e.g. missing from storage)
                        continue
                    file_info.append(make_row(resid, res, "<unknown>", file_id, "<unknown>"))

        print(f"Files in tiles without fileids: {len(files_without_id)}")
        print(f"Files referenced in tiles but skipped or unknown: {unresolved_total}")
        file_info.sort(key=lambda x: x['resource name'])
        print(f"Number of files actually referenced in tiles: {matched_total}")
        print(f"Number of orphaned files: {orphan_total}")
        return file_info
logger =
<Logger arches_extensions.management.commands.get_files (WARNING)>
class
Command(django.core.management.base.BaseCommand):
class Command(BaseCommand):
    """Generate a list of file names for file-list nodes within the
specific resources.

    .. warning::
        This command is a work-in-progress

    Usage:

        python manage.py get_files

    Arguments:

    - `--resource`: Id for single resource instance to include (optional)
    - `--graph`: Name of graph, all instances will be included (optional)
    - `--make-csv`: Exports a CSV list of all file info, named for graph or instance (default=False)
    - `--make-archive`: Creates a zip archive of all files, named for graph or instance (default=False)
    - `--include-orphans`: Includes File objects that are no longer in tile data, but do exist in the database (default=False)

    """

    def __init__(self, *args, **kwargs):
        ## run the BaseCommand initialiser first; the original skipped it
        super().__init__(*args, **kwargs)
        ## the class docstring doubles as the command's help text, which is
        ## why its wording is left exactly as it was
        self.help = self.__doc__

    def add_arguments(self, parser):
        """Register the command-line options on ``parser``."""

        parser.formatter_class = ArchesHelpTextFormatter

        parser.add_argument("--resource")
        parser.add_argument("--graph")
        parser.add_argument("--make-csv", action="store_true")
        parser.add_argument("--make-archive", action="store_true")
        parser.add_argument("--include-orphans", action="store_true")

    def handle(self, *args, **options):
        """Collect file info per scope and optionally export a CSV and/or zip.

        A scope is a ``(label, File queryset)`` pair: the single resource
        given by ``--resource``, the graph given by ``--graph``, or (when
        neither is given) every resource graph except "Arches System
        Settings". Output files are written to the current working
        directory and are named after the scope label.
        """

        include_orphans = options["include_orphans"]
        suffix = "__withorphans" if include_orphans else ""

        ## get all relevant file objects
        scopes = []
        if options["resource"]:
            ## the original referenced an undefined name ``scope`` here
            resource_id = options["resource"]
            scopes.append((resource_id, File.objects.filter(tile__resourceinstance_id=resource_id)))
        elif options["graph"]:
            scopes.append((options["graph"], File.objects.filter(tile__resourceinstance__graph__name=options["graph"])))
        else:
            for graph in Graph.objects.filter(isresource=True).exclude(name="Arches System Settings"):
                scopes.append((graph.name, File.objects.filter(tile__resourceinstance__graph=graph)))

        ## process all scopes
        for scope, files in scopes:
            print(f"Getting files for: {scope}")
            info = self.collect_file_info(files, include_orphans=include_orphans)

            if options["make_csv"]:
                if not info:
                    print("no data to write, skipping")
                else:
                    ## leave the 'file path' column out of the CSV without
                    ## mutating the rows themselves
                    fieldnames = [key for key in info[0] if key != "file path"]
                    ## newline="" is what the csv module expects of its file object
                    with open(f"{scope}__filelist{suffix}.csv", "w", newline="", encoding="utf-8") as o:
                        writer = csv.DictWriter(o, fieldnames=fieldnames, extrasaction="ignore")
                        writer.writeheader()
                        writer.writerows(info)

            if options["make_archive"]:
                if not info:
                    print("no files to archive, skipping")
                else:
                    usefiles = File.objects.filter(fileid__in=[row["file id"] for row in info])
                    with ZipFile(f"{scope}__files{suffix}.zip", "w") as zip_file:
                        for f in usefiles:
                            with f.path.open("rb") as content:
                                ## archive entries are flat: base name only
                                zip_file.writestr(f.path.name.split("/")[-1], content.read())

    def collect_file_info(self, files, include_orphans=False):
        """Build one info row per file that is referenced from tile data.

        :param files: queryset of File objects to inspect (``.count()`` is
            called on it, so a plain list will not work).
        :param include_orphans: when True, also add rows for File objects
            attached to a tile whose data no longer lists them.
        :returns: list of dicts with the keys "resource id", "resource name",
            "node name", "file id", "file name (original)",
            "file name (actual)" and "file path", sorted by resource name.

        Files that do not exist in storage are reported and never appear in
        the result. Summary counts are printed along the way.
        """
        print(f"File objects: {files.count()}")

        ## quick check for missing files; only existing ones go in the lookup
        file_lookup = {}
        missing_total = 0
        for f in files:
            if f.path.storage.exists(f.path.name):
                file_lookup[str(f.fileid)] = f
            else:
                print("file missing:", f.path.path)
                missing_total += 1
        print(f"Missing files to be skipped: {missing_total}")

        ## create a lookup for the tiles the remaining files are attached to
        tile_lookup = {}
        for f in file_lookup.values():
            tileid = str(f.tile_id)
            if tileid not in tile_lookup:
                tile_lookup[tileid] = Tile.objects.get(pk=tileid)
        print(f"Tiles with files in them: {len(tile_lookup)}")

        def make_row(resid, res, node_name, file_id, original_name):
            ## one output row; file_id must be a key of file_lookup
            file_obj = file_lookup[file_id]
            return {
                "resource id": resid,
                "resource name": res.displayname,
                "node name": node_name,
                "file id": file_id,
                "file name (original)": original_name,
                "file name (actual)": Path(file_obj.path.name).name,
                "file path": file_obj.path.path,
            }

        ## iterate all tiles that have been identified to have files,
        ## and collect the file ids that are stored in the tile data
        ## also make lookups for resources and nodes along the way for later use
        node_lookup = {}
        res_lookup = {}
        file_info = []
        files_without_id = []
        unresolved_total = 0
        orphan_total = 0
        matched_total = 0
        for tile in tile_lookup.values():
            resid = str(tile.resourceinstance_id)
            ## explicit membership test: dict.get(key, <query>) would run the
            ## query on every iteration, cached or not
            if resid not in res_lookup:
                res_lookup[resid] = Resource.objects.get(pk=resid)
            res = res_lookup[resid]

            found_ids = []
            for nodeid, value in tile.data.items():
                if nodeid not in node_lookup:
                    node_lookup[nodeid] = Node.objects.get(pk=nodeid)
                node = node_lookup[nodeid]
                if node.datatype != "file-list":
                    continue
                ## an emptied file-list node may hold None instead of a list
                for entry in value or []:
                    file_id = str(entry.get("file_id"))
                    if file_id == "None":
                        files_without_id.append(entry)
                        continue
                    ## referenced by the tile, so never an orphan
                    found_ids.append(file_id)
                    if file_id not in file_lookup:
                        ## missing from storage or not part of ``files``
                        unresolved_total += 1
                        continue
                    matched_total += 1
                    file_info.append(make_row(resid, res, node.name, file_id, entry.get("name", "<unknown>")))

            orphans = File.objects.filter(tile=tile).exclude(fileid__in=found_ids)
            orphan_total += orphans.count()
            if include_orphans:
                for orphan in orphans:
                    file_id = str(orphan.pk)
                    if file_id not in file_lookup:
                        ## skipped earlier (e.g. missing from storage)
                        continue
                    file_info.append(make_row(resid, res, "<unknown>", file_id, "<unknown>"))

        print(f"Files in tiles without fileids: {len(files_without_id)}")
        print(f"Files referenced in tiles but skipped or unknown: {unresolved_total}")
        file_info.sort(key=lambda x: x['resource name'])
        print(f"Number of files actually referenced in tiles: {matched_total}")
        print(f"Number of orphaned files: {orphan_total}")
        return file_info
Generate a list of file names for file-list nodes within the specified resources.
This command is a work-in-progress
Usage:
python manage.py get_files
Arguments:
- `--resource`: Id for single resource instance to include (optional)
- `--graph`: Name of graph, all instances will be included (optional)
- `--make-csv`: Exports a CSV list of all file info, named for graph or instance (default=False)
- `--make-archive`: Creates a zip archive of all files, named for graph or instance (default=False)
- `--include-orphans`: Includes File objects that are no longer in tile data, but do exist in the database (default=False)
def
add_arguments(self, parser):
def add_arguments(self, parser):
    """Register this command's options on ``parser``.

    ``--resource`` and ``--graph`` each take a value; ``--make-csv``,
    ``--make-archive`` and ``--include-orphans`` are switches that are
    False unless given.
    """
    parser.formatter_class = ArchesHelpTextFormatter

    ## options that carry a value
    for value_option in ("--resource", "--graph"):
        parser.add_argument(value_option)

    ## plain on/off switches
    for switch in ("--make-csv", "--make-archive", "--include-orphans"):
        parser.add_argument(switch, action="store_true")
Entry point for subclassed commands to add custom arguments.
def
handle(self, *args, **options):
def handle(self, *args, **options):
    """Collect file info per scope and optionally export a CSV and/or zip.

    A scope is a ``(label, File queryset)`` pair: the single resource
    given by ``--resource``, the graph given by ``--graph``, or (when
    neither is given) every resource graph except "Arches System
    Settings". Output files are written to the current working directory
    and are named after the scope label.
    """

    include_orphans = options["include_orphans"]
    suffix = "__withorphans" if include_orphans else ""

    ## get all relevant file objects
    scopes = []
    if options["resource"]:
        ## the original referenced an undefined name ``scope`` here
        resource_id = options["resource"]
        scopes.append((resource_id, File.objects.filter(tile__resourceinstance_id=resource_id)))
    elif options["graph"]:
        scopes.append((options["graph"], File.objects.filter(tile__resourceinstance__graph__name=options["graph"])))
    else:
        for graph in Graph.objects.filter(isresource=True).exclude(name="Arches System Settings"):
            scopes.append((graph.name, File.objects.filter(tile__resourceinstance__graph=graph)))

    ## process all scopes
    for scope, files in scopes:
        print(f"Getting files for: {scope}")
        info = self.collect_file_info(files, include_orphans=include_orphans)

        if options["make_csv"]:
            if not info:
                print("no data to write, skipping")
            else:
                ## leave the 'file path' column out of the CSV without
                ## mutating the rows themselves
                fieldnames = [key for key in info[0] if key != "file path"]
                ## newline="" is what the csv module expects of its file object
                with open(f"{scope}__filelist{suffix}.csv", "w", newline="", encoding="utf-8") as o:
                    writer = csv.DictWriter(o, fieldnames=fieldnames, extrasaction="ignore")
                    writer.writeheader()
                    writer.writerows(info)

        if options["make_archive"]:
            if not info:
                print("no files to archive, skipping")
            else:
                usefiles = File.objects.filter(fileid__in=[row["file id"] for row in info])
                with ZipFile(f"{scope}__files{suffix}.zip", "w") as zip_file:
                    for f in usefiles:
                        with f.path.open("rb") as content:
                            ## archive entries are flat: base name only
                            zip_file.writestr(f.path.name.split("/")[-1], content.read())
The actual logic of the command. Subclasses must implement this method.
def
collect_file_info(self, files, include_orphans=False):
def collect_file_info(self, files, include_orphans=False):
    """Build one info row per file that is referenced from tile data.

    :param files: queryset of File objects to inspect (``.count()`` is
        called on it, so a plain list will not work).
    :param include_orphans: when True, also add rows for File objects
        attached to a tile whose data no longer lists them.
    :returns: list of dicts with the keys "resource id", "resource name",
        "node name", "file id", "file name (original)",
        "file name (actual)" and "file path", sorted by resource name.

    Files that do not exist in storage are reported and never appear in
    the result. Summary counts are printed along the way.
    """
    print(f"File objects: {files.count()}")

    ## quick check for missing files; only existing ones go in the lookup
    file_lookup = {}
    missing_total = 0
    for f in files:
        if f.path.storage.exists(f.path.name):
            file_lookup[str(f.fileid)] = f
        else:
            print("file missing:", f.path.path)
            missing_total += 1
    print(f"Missing files to be skipped: {missing_total}")

    ## create a lookup for the tiles the remaining files are attached to
    tile_lookup = {}
    for f in file_lookup.values():
        tileid = str(f.tile_id)
        if tileid not in tile_lookup:
            tile_lookup[tileid] = Tile.objects.get(pk=tileid)
    print(f"Tiles with files in them: {len(tile_lookup)}")

    def make_row(resid, res, node_name, file_id, original_name):
        ## one output row; file_id must be a key of file_lookup
        file_obj = file_lookup[file_id]
        return {
            "resource id": resid,
            "resource name": res.displayname,
            "node name": node_name,
            "file id": file_id,
            "file name (original)": original_name,
            "file name (actual)": Path(file_obj.path.name).name,
            "file path": file_obj.path.path,
        }

    ## iterate all tiles that have been identified to have files,
    ## and collect the file ids that are stored in the tile data
    ## also make lookups for resources and nodes along the way for later use
    node_lookup = {}
    res_lookup = {}
    file_info = []
    files_without_id = []
    unresolved_total = 0
    orphan_total = 0
    matched_total = 0
    for tile in tile_lookup.values():
        resid = str(tile.resourceinstance_id)
        ## explicit membership test: dict.get(key, <query>) would run the
        ## query on every iteration, cached or not
        if resid not in res_lookup:
            res_lookup[resid] = Resource.objects.get(pk=resid)
        res = res_lookup[resid]

        found_ids = []
        for nodeid, value in tile.data.items():
            if nodeid not in node_lookup:
                node_lookup[nodeid] = Node.objects.get(pk=nodeid)
            node = node_lookup[nodeid]
            if node.datatype != "file-list":
                continue
            ## an emptied file-list node may hold None instead of a list
            for entry in value or []:
                file_id = str(entry.get("file_id"))
                if file_id == "None":
                    files_without_id.append(entry)
                    continue
                ## referenced by the tile, so never an orphan
                found_ids.append(file_id)
                if file_id not in file_lookup:
                    ## missing from storage or not part of ``files``
                    unresolved_total += 1
                    continue
                matched_total += 1
                file_info.append(make_row(resid, res, node.name, file_id, entry.get("name", "<unknown>")))

        orphans = File.objects.filter(tile=tile).exclude(fileid__in=found_ids)
        orphan_total += orphans.count()
        if include_orphans:
            for orphan in orphans:
                file_id = str(orphan.pk)
                if file_id not in file_lookup:
                    ## skipped earlier (e.g. missing from storage)
                    continue
                file_info.append(make_row(resid, res, "<unknown>", file_id, "<unknown>"))

    print(f"Files in tiles without fileids: {len(files_without_id)}")
    print(f"Files referenced in tiles but skipped or unknown: {unresolved_total}")
    file_info.sort(key=lambda x: x['resource name'])
    print(f"Number of files actually referenced in tiles: {matched_total}")
    print(f"Number of orphaned files: {orphan_total}")
    return file_info