arches_extensions.management.commands.get_files

View Source

  1import csv
  2import logging
  3from pathlib import Path
  4from zipfile import ZipFile
  5
  6from django.core.management.base import BaseCommand
  7
  8from arches.app.models.models import Node, File
  9from arches.app.models.resource import Resource
 10from arches.app.models.graph import Graph
 11from arches.app.models.tile import Tile
 12
 13from arches_extensions.utils import ArchesHelpTextFormatter
 14
 15logger = logging.getLogger(__name__)
 16
 17class Command(BaseCommand):
 18    """Generate a list of file names for file-list nodes within the
 19specific resources.
 20
 21    .. warning::
 22        This command is a work-in-progress
 23
 24    Usage:
 25
 26        python manage.py get_files
 27
 28    Arguments:
 29
 30        - `--resource`: Id for single resource instance to include (optional)
 31        - `--graph`: Name of graph, all instances will be included (optional)
 32        - `--make-csv`: Exports a CSV list of all file info, named for graph or instance (default=False)
 33        - `--make-archive`: Creates a zip archive of all files, named for graph or instance (default=False)
 34        - `--include-orphans`: Includes File objects that are no longer in tile data, but do exist in the database (default=False)
 35
 36    """
 37
 38    def __init__(self, *args, **kwargs):
 39        self.help = self.__doc__
 40
 41    def add_arguments(self, parser):
 42
 43        parser.formatter_class = ArchesHelpTextFormatter
 44
 45        parser.add_argument("--resource")
 46        parser.add_argument("--graph")
 47        parser.add_argument("--make-csv", action="store_true")
 48        parser.add_argument("--make-archive", action="store_true")
 49        parser.add_argument("--include-orphans", action="store_true")
 50
 51    def handle(self, *args, **options):
 52
 53        scopes = []
 54
 55        ## get all relevant file objects
 56        if options["resource"]:
 57            scopes.append((options["resource"], File.objects.filter(tile__resourceinstance_id=scope)))
 58        elif options["graph"]:
 59            scopes.append((options["graph"], File.objects.filter(tile__resourceinstance__graph__name=options["graph"])))
 60        else:
 61            for graph in Graph.objects.filter(isresource=True).exclude(name="Arches System Settings"):
 62                scopes.append((graph.name, File.objects.filter(tile__resourceinstance__graph=graph)))
 63
 64        ## process all scopes
 65        for scope, files in scopes:
 66            print(f"Getting files for: {scope}")
 67            info = self.collect_file_info(files, include_orphans=options["include_orphans"])
 68            if options["make_csv"]:
 69                if len(info) == 0:
 70                    print("no data to write, skipping")
 71                else:
 72                    for i in info:
 73                        del i['file path']
 74                    csv_name = f"{scope}__filelist"
 75                    if options["include_orphans"]:
 76                        csv_name += "__withorphans"
 77                    with open(f"{csv_name}.csv", "w") as o:
 78                        writer = csv.DictWriter(o, fieldnames=info[0].keys())
 79                        writer.writeheader()
 80                        writer.writerows(info)
 81
 82            if options["make_archive"]:
 83                if len(info) == 0:
 84                    print("no files to archive, skipping")
 85                else:
 86                    usefiles = File.objects.filter(fileid__in=[i['file id'] for i in info])
 87                    zip_name = f"{scope}__files"
 88                    if options["include_orphans"]:
 89                        zip_name += "__withorphans"
 90                    with ZipFile(f"{zip_name}.zip", "w") as zip_file:
 91                        for f in usefiles:
 92                            with f.path.open("rb") as content:
 93                                zip_file.writestr(f.path.name.split("/")[-1], content.read())
 94
 95    def collect_file_info(self, files, include_orphans=False):
 96        print(f"File objects: {files.count()}")
 97
 98        ## quick check for missing files
 99        missing = []
100        for f in files:
101            if not f.path.storage.exists(f.path.name):
102                print("file missing:", f.path.path)
103                missing.append(f)
104        print(f"Missing files to be skipped: {len(missing)}")
105        files = [i for i in files if i not in missing]
106
107        ## iterate files and create a lookup for tiles they are referenced by
108        file_lookup = {}
109        tile_lookup = {}
110        for f in files:
111            file_lookup[str(f.fileid)] = f
112            tileid = str(f.tile_id)
113            if tileid not in tile_lookup:
114                tile_lookup[tileid] = Tile.objects.get(pk=tileid)
115        print(f"Tiles with files in them: {len(tile_lookup)}")
116
117        ## iterate all tiles that have been identified to have files,
118        ## and collect the file ids that are stored in the tile data
119        ## also make lookups for resources and nodes along the way for later use
120        node_lookup = {}
121        res_lookup = {}
122        file_info = []
123        files_without_id = []
124        orphan_total = 0
125        matched_total = 0
126        for t, tile in tile_lookup.items():
127            resid = str(tile.resourceinstance_id)
128            res = res_lookup.get(resid, Resource.objects.get(pk=resid))
129            if resid not in res_lookup:
130                res_lookup[resid] = res
131            found_ids = []
132            for k, v in tile.data.items():
133                node = node_lookup.get(k, Node.objects.get(pk=k))
134                if k not in node_lookup:
135                    node_lookup[k] = node
136                if node.datatype == "file-list":
137                    for i in v:
138                        id = str(i['file_id'])
139                        if id == "None":
140                            files_without_id.append(i)
141                            continue
142                        found_ids.append(id)
143                        matched_total += 1
144                        file_info.append({
145                            "resource id": resid,
146                            "resource name": res.displayname,
147                            "node name": node.name,
148                            "file id": id,
149                            "file name (original)": i['name'],
150                            "file name (actual)": Path(file_lookup[id].path.name).name,
151                            "file path": file_lookup[id].path.path,
152                        })
153            orphans = File.objects.filter(tile=tile).exclude(fileid__in=found_ids)
154            orphan_total += orphans.count()
155            if include_orphans:
156                for orphan in orphans:
157                    id = str(orphan.pk)
158                    file_info.append({
159                        "resource id": resid,
160                        "resource name": res.displayname,
161                        "node name": "<unknown>",
162                        "file id": id,
163                        "file name (original)": "<unknown>",
164                        "file name (actual)": Path(file_lookup[id].path.name).name,
165                        "file path": file_lookup[id].path.path,
166                    })
167
168        print(f"Files in tiles without fileids: {len(files_without_id)}")
169        file_info.sort(key=lambda x: x['resource name'])
170        print(f"Number of files actually referenced in tiles: {matched_total}")
171        print(f"Number of orphaned files: {orphan_total}")
172        return file_info

logger = <Logger arches_extensions.management.commands.get_files (WARNING)>

class Command(django.core.management.base.BaseCommand): View Source

 18class Command(BaseCommand):
 19    """Generate a list of file names for file-list nodes within the
 20specific resources.
 21
 22    .. warning::
 23        This command is a work-in-progress
 24
 25    Usage:
 26
 27        python manage.py get_files
 28
 29    Arguments:
 30
 31        - `--resource`: Id for single resource instance to include (optional)
 32        - `--graph`: Name of graph, all instances will be included (optional)
 33        - `--make-csv`: Exports a CSV list of all file info, named for graph or instance (default=False)
 34        - `--make-archive`: Creates a zip archive of all files, named for graph or instance (default=False)
 35        - `--include-orphans`: Includes File objects that are no longer in tile data, but do exist in the database (default=False)
 36
 37    """
 38
 39    def __init__(self, *args, **kwargs):
 40        self.help = self.__doc__
 41
 42    def add_arguments(self, parser):
 43
 44        parser.formatter_class = ArchesHelpTextFormatter
 45
 46        parser.add_argument("--resource")
 47        parser.add_argument("--graph")
 48        parser.add_argument("--make-csv", action="store_true")
 49        parser.add_argument("--make-archive", action="store_true")
 50        parser.add_argument("--include-orphans", action="store_true")
 51
 52    def handle(self, *args, **options):
 53
 54        scopes = []
 55
 56        ## get all relevant file objects
 57        if options["resource"]:
 58            scopes.append((options["resource"], File.objects.filter(tile__resourceinstance_id=scope)))
 59        elif options["graph"]:
 60            scopes.append((options["graph"], File.objects.filter(tile__resourceinstance__graph__name=options["graph"])))
 61        else:
 62            for graph in Graph.objects.filter(isresource=True).exclude(name="Arches System Settings"):
 63                scopes.append((graph.name, File.objects.filter(tile__resourceinstance__graph=graph)))
 64
 65        ## process all scopes
 66        for scope, files in scopes:
 67            print(f"Getting files for: {scope}")
 68            info = self.collect_file_info(files, include_orphans=options["include_orphans"])
 69            if options["make_csv"]:
 70                if len(info) == 0:
 71                    print("no data to write, skipping")
 72                else:
 73                    for i in info:
 74                        del i['file path']
 75                    csv_name = f"{scope}__filelist"
 76                    if options["include_orphans"]:
 77                        csv_name += "__withorphans"
 78                    with open(f"{csv_name}.csv", "w") as o:
 79                        writer = csv.DictWriter(o, fieldnames=info[0].keys())
 80                        writer.writeheader()
 81                        writer.writerows(info)
 82
 83            if options["make_archive"]:
 84                if len(info) == 0:
 85                    print("no files to archive, skipping")
 86                else:
 87                    usefiles = File.objects.filter(fileid__in=[i['file id'] for i in info])
 88                    zip_name = f"{scope}__files"
 89                    if options["include_orphans"]:
 90                        zip_name += "__withorphans"
 91                    with ZipFile(f"{zip_name}.zip", "w") as zip_file:
 92                        for f in usefiles:
 93                            with f.path.open("rb") as content:
 94                                zip_file.writestr(f.path.name.split("/")[-1], content.read())
 95
 96    def collect_file_info(self, files, include_orphans=False):
 97        print(f"File objects: {files.count()}")
 98
 99        ## quick check for missing files
100        missing = []
101        for f in files:
102            if not f.path.storage.exists(f.path.name):
103                print("file missing:", f.path.path)
104                missing.append(f)
105        print(f"Missing files to be skipped: {len(missing)}")
106        files = [i for i in files if i not in missing]
107
108        ## iterate files and create a lookup for tiles they are referenced by
109        file_lookup = {}
110        tile_lookup = {}
111        for f in files:
112            file_lookup[str(f.fileid)] = f
113            tileid = str(f.tile_id)
114            if tileid not in tile_lookup:
115                tile_lookup[tileid] = Tile.objects.get(pk=tileid)
116        print(f"Tiles with files in them: {len(tile_lookup)}")
117
118        ## iterate all tiles that have been identified to have files,
119        ## and collect the file ids that are stored in the tile data
120        ## also make lookups for resources and nodes along the way for later use
121        node_lookup = {}
122        res_lookup = {}
123        file_info = []
124        files_without_id = []
125        orphan_total = 0
126        matched_total = 0
127        for t, tile in tile_lookup.items():
128            resid = str(tile.resourceinstance_id)
129            res = res_lookup.get(resid, Resource.objects.get(pk=resid))
130            if resid not in res_lookup:
131                res_lookup[resid] = res
132            found_ids = []
133            for k, v in tile.data.items():
134                node = node_lookup.get(k, Node.objects.get(pk=k))
135                if k not in node_lookup:
136                    node_lookup[k] = node
137                if node.datatype == "file-list":
138                    for i in v:
139                        id = str(i['file_id'])
140                        if id == "None":
141                            files_without_id.append(i)
142                            continue
143                        found_ids.append(id)
144                        matched_total += 1
145                        file_info.append({
146                            "resource id": resid,
147                            "resource name": res.displayname,
148                            "node name": node.name,
149                            "file id": id,
150                            "file name (original)": i['name'],
151                            "file name (actual)": Path(file_lookup[id].path.name).name,
152                            "file path": file_lookup[id].path.path,
153                        })
154            orphans = File.objects.filter(tile=tile).exclude(fileid__in=found_ids)
155            orphan_total += orphans.count()
156            if include_orphans:
157                for orphan in orphans:
158                    id = str(orphan.pk)
159                    file_info.append({
160                        "resource id": resid,
161                        "resource name": res.displayname,
162                        "node name": "<unknown>",
163                        "file id": id,
164                        "file name (original)": "<unknown>",
165                        "file name (actual)": Path(file_lookup[id].path.name).name,
166                        "file path": file_lookup[id].path.path,
167                    })
168
169        print(f"Files in tiles without fileids: {len(files_without_id)}")
170        file_info.sort(key=lambda x: x['resource name'])
171        print(f"Number of files actually referenced in tiles: {matched_total}")
172        print(f"Number of orphaned files: {orphan_total}")
173        return file_info

Generate a list of file names for file-list nodes within the specific resources.



Warning: This command is a work-in-progress.




Usage:

    python manage.py get_files

Arguments:

    - `--resource`: Id for single resource instance to include (optional)
    - `--graph`: Name of graph, all instances will be included (optional)
    - `--make-csv`: Exports a CSV list of all file info, named for graph or instance (default=False)
    - `--make-archive`: Creates a zip archive of all files, named for graph or instance (default=False)
    - `--include-orphans`: Includes File objects that are no longer in tile data, but do exist in the database (default=False)

Command(*args, **kwargs) View Source

39    def __init__(self, *args, **kwargs):
40        self.help = self.__doc__

help = ''

def add_arguments(self, parser): View Source

42    def add_arguments(self, parser):
43
44        parser.formatter_class = ArchesHelpTextFormatter
45
46        parser.add_argument("--resource")
47        parser.add_argument("--graph")
48        parser.add_argument("--make-csv", action="store_true")
49        parser.add_argument("--make-archive", action="store_true")
50        parser.add_argument("--include-orphans", action="store_true")

Entry point for subclassed commands to add custom arguments.

def handle(self, *args, **options): View Source

52    def handle(self, *args, **options):
53
54        scopes = []
55
56        ## get all relevant file objects
57        if options["resource"]:
58            scopes.append((options["resource"], File.objects.filter(tile__resourceinstance_id=scope)))
59        elif options["graph"]:
60            scopes.append((options["graph"], File.objects.filter(tile__resourceinstance__graph__name=options["graph"])))
61        else:
62            for graph in Graph.objects.filter(isresource=True).exclude(name="Arches System Settings"):
63                scopes.append((graph.name, File.objects.filter(tile__resourceinstance__graph=graph)))
64
65        ## process all scopes
66        for scope, files in scopes:
67            print(f"Getting files for: {scope}")
68            info = self.collect_file_info(files, include_orphans=options["include_orphans"])
69            if options["make_csv"]:
70                if len(info) == 0:
71                    print("no data to write, skipping")
72                else:
73                    for i in info:
74                        del i['file path']
75                    csv_name = f"{scope}__filelist"
76                    if options["include_orphans"]:
77                        csv_name += "__withorphans"
78                    with open(f"{csv_name}.csv", "w") as o:
79                        writer = csv.DictWriter(o, fieldnames=info[0].keys())
80                        writer.writeheader()
81                        writer.writerows(info)
82
83            if options["make_archive"]:
84                if len(info) == 0:
85                    print("no files to archive, skipping")
86                else:
87                    usefiles = File.objects.filter(fileid__in=[i['file id'] for i in info])
88                    zip_name = f"{scope}__files"
89                    if options["include_orphans"]:
90                        zip_name += "__withorphans"
91                    with ZipFile(f"{zip_name}.zip", "w") as zip_file:
92                        for f in usefiles:
93                            with f.path.open("rb") as content:
94                                zip_file.writestr(f.path.name.split("/")[-1], content.read())

The actual logic of the command. Subclasses must implement this method.

def collect_file_info(self, files, include_orphans=False): View Source

 96    def collect_file_info(self, files, include_orphans=False):
 97        print(f"File objects: {files.count()}")
 98
 99        ## quick check for missing files
100        missing = []
101        for f in files:
102            if not f.path.storage.exists(f.path.name):
103                print("file missing:", f.path.path)
104                missing.append(f)
105        print(f"Missing files to be skipped: {len(missing)}")
106        files = [i for i in files if i not in missing]
107
108        ## iterate files and create a lookup for tiles they are referenced by
109        file_lookup = {}
110        tile_lookup = {}
111        for f in files:
112            file_lookup[str(f.fileid)] = f
113            tileid = str(f.tile_id)
114            if tileid not in tile_lookup:
115                tile_lookup[tileid] = Tile.objects.get(pk=tileid)
116        print(f"Tiles with files in them: {len(tile_lookup)}")
117
118        ## iterate all tiles that have been identified to have files,
119        ## and collect the file ids that are stored in the tile data
120        ## also make lookups for resources and nodes along the way for later use
121        node_lookup = {}
122        res_lookup = {}
123        file_info = []
124        files_without_id = []
125        orphan_total = 0
126        matched_total = 0
127        for t, tile in tile_lookup.items():
128            resid = str(tile.resourceinstance_id)
129            res = res_lookup.get(resid, Resource.objects.get(pk=resid))
130            if resid not in res_lookup:
131                res_lookup[resid] = res
132            found_ids = []
133            for k, v in tile.data.items():
134                node = node_lookup.get(k, Node.objects.get(pk=k))
135                if k not in node_lookup:
136                    node_lookup[k] = node
137                if node.datatype == "file-list":
138                    for i in v:
139                        id = str(i['file_id'])
140                        if id == "None":
141                            files_without_id.append(i)
142                            continue
143                        found_ids.append(id)
144                        matched_total += 1
145                        file_info.append({
146                            "resource id": resid,
147                            "resource name": res.displayname,
148                            "node name": node.name,
149                            "file id": id,
150                            "file name (original)": i['name'],
151                            "file name (actual)": Path(file_lookup[id].path.name).name,
152                            "file path": file_lookup[id].path.path,
153                        })
154            orphans = File.objects.filter(tile=tile).exclude(fileid__in=found_ids)
155            orphan_total += orphans.count()
156            if include_orphans:
157                for orphan in orphans:
158                    id = str(orphan.pk)
159                    file_info.append({
160                        "resource id": resid,
161                        "resource name": res.displayname,
162                        "node name": "<unknown>",
163                        "file id": id,
164                        "file name (original)": "<unknown>",
165                        "file name (actual)": Path(file_lookup[id].path.name).name,
166                        "file path": file_lookup[id].path.path,
167                    })
168
169        print(f"Files in tiles without fileids: {len(files_without_id)}")
170        file_info.sort(key=lambda x: x['resource name'])
171        print(f"Number of files actually referenced in tiles: {matched_total}")
172        print(f"Number of orphaned files: {orphan_total}")
173        return file_info