#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import bisect
import collections
import io
import json
import os
import re
import sqlite3
import stat
import sys
import tarfile
import tempfile
import time
import traceback
from timeit import default_timer as timer
import typing
from typing import Any, AnyStr, BinaryIO, Dict, IO, Iterable, List, Optional, Set, Tuple, Union
import fuse
# Can't do this dynamically with importlib.import_module and using supportedCompressions
# because then the static checkers like mypy and pylint won't recognize the modules!
try:
import indexed_bzip2
except ImportError:
pass
try:
import indexed_gzip
except ImportError:
pass
try:
import indexed_zstd
except ImportError:
pass
try:
import lzmaffi
except ImportError:
pass
__version__ = '0.7.0'
# Defining a lambda does not check the names used inside it; they are only resolved when it is called!
CompressionInfo = collections.namedtuple(
'CompressionInfo', ['suffixes', 'doubleSuffixes', 'moduleName', 'checkHeader', 'open']
)
supportedCompressions = {
'bz2': CompressionInfo(
['bz2', 'bzip2'],
['tb2', 'tbz', 'tbz2', 'tz2'],
'indexed_bzip2',
lambda x: (x.read(4)[:3] == b'BZh' and x.read(6) == (0x314159265359).to_bytes(6, 'big')),
lambda x: indexed_bzip2.IndexedBzip2File(x.fileno()),
),
'gz': CompressionInfo(
['gz', 'gzip'],
['taz', 'tgz'],
'indexed_gzip',
lambda x: x.read(2) == b'\x1F\x8B',
lambda x: indexed_gzip.IndexedGzipFile(fileobj=x),
),
'xz': CompressionInfo(
['xz'], ['txz'], 'lzmaffi', lambda x: x.read(6) == b"\xFD7zXZ\x00", lambda x: lzmaffi.open(x)
),
'zst': CompressionInfo(
['zst', 'zstd'],
['tzst'],
'indexed_zstd',
lambda x: x.read(4) == (0xFD2FB528).to_bytes(4, 'little'),
lambda x: indexed_zstd.IndexedZstdFile(x.fileno()),
),
}
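# Illustrative sketch (not part of the original module): the table above can be used to detect the
# compression of an opened file by checking its magic bytes. 'detectCompression' is a hypothetical
# helper name; the actual detection happens when opening the file (see _openCompressedFile below).
#
#     def detectCompression(fileobj):
#         for name, info in supportedCompressions.items():
#             fileobj.seek(0)
#             if info.checkHeader(fileobj):
#                 fileobj.seek(0)
#                 return name
#         fileobj.seek(0)
#         return None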
def stripSuffixFromCompressedFile(path: str) -> str:
"""Strips compression suffixes like .bz2, .gz, ..."""
for compression in supportedCompressions.values():
for suffix in compression.suffixes:
if path.lower().endswith('.' + suffix.lower()):
return path[: -(len(suffix) + 1)]
return path
def stripSuffixFromTarFile(path: str) -> str:
"""Strips extensions like .tar.gz or .gz or .tgz, ..."""
# 1. Try for conflated suffixes first
for compression in supportedCompressions.values():
for suffix in compression.doubleSuffixes + ['t' + s for s in compression.suffixes]:
if path.lower().endswith('.' + suffix.lower()):
return path[: -(len(suffix) + 1)]
# 2. Remove compression suffixes
path = stripSuffixFromCompressedFile(path)
# 3. Remove .tar if we are left with it after the compression suffix removal
if path.lower().endswith('.tar'):
path = path[:-4]
return path
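# Illustrative examples for the two helpers above:
#     stripSuffixFromCompressedFile('archive.tar.gz') -> 'archive.tar'
#     stripSuffixFromTarFile('archive.tar.gz')        -> 'archive'
#     stripSuffixFromTarFile('archive.tgz')           -> 'archive'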
printDebug = 1
class RatarmountError(Exception):
"""Base exception for ratarmount module."""
class IndexNotOpenError(RatarmountError):
"""Exception for operations executed on a closed index database."""
class InvalidIndexError(RatarmountError):
"""Exception for indexes being invalid, outdated, or created with different arguments."""
class CompressionError(RatarmountError):
"""Exception for trying to open files with unsupported compression or unavailable decompression module."""
def overrides(parentClass):
"""Simple decorator that checks that a method with the same name exists in the parent class"""
def overrider(method):
assert method.__name__ in dir(parentClass)
assert callable(getattr(parentClass, method.__name__))
return method
return overrider
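# Usage sketch for the decorator above; the assertions fire at class definition time
# if the parent class has no callable member of the same name ('CountingIO' is hypothetical):
#
#     class CountingIO(io.BufferedIOBase):
#         @overrides(io.BufferedIOBase)
#         def close(self) -> None:
#             pass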
class ProgressBar:
"""Simple progress bar which keeps track of changes and prints the progress and a time estimate."""
def __init__(self, maxValue: float):
# fmt: off
self.maxValue = maxValue
self.lastUpdateTime = time.time()
self.lastUpdateValue = 0.
self.updateInterval = 2. # seconds
self.creationTime = time.time()
# fmt: on
def update(self, value: float) -> None:
"""Should be called whenever the monitored value changes. The progress bar is updated accordingly."""
if self.lastUpdateTime is not None and (time.time() - self.lastUpdateTime) < self.updateInterval:
return
        # Use the whole interval since start to estimate the remaining time
        eta1 = int((time.time() - self.creationTime) / max(value, 1) * (self.maxValue - value))
        # Use only a shorter window interval to estimate the remaining time.
        # This accounts better for higher speeds in the beginning, e.g., caused by caching effects.
        # However, this estimate might vary a lot while the other one stabilizes after some time!
        # Guard the divisors against zero in case update is called with an unchanged or zero value.
        eta2 = int((time.time() - self.lastUpdateTime) / max(value - self.lastUpdateValue, 1) * (self.maxValue - value))
print(
"Currently at position {} of {} ({:.2f}%). "
"Estimated time remaining with current rate: {} min {} s, with average rate: {} min {} s.".format(
# fmt:off
value, self.maxValue, value / self.maxValue * 100.0,
eta2 // 60, eta2 % 60,
eta1 // 60, eta1 % 60
# fmt:on
),
flush=True,
)
self.lastUpdateTime = time.time()
self.lastUpdateValue = value
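# Usage sketch (illustrative): report progress while processing a file in chunks.
#
#     progressBar = ProgressBar(os.path.getsize(path))
#     with open(path, 'rb') as file:
#         while file.read(1024 * 1024):
#             progressBar.update(file.tell())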
class StenciledFile(io.BufferedIOBase):
"""A file abstraction layer giving a stenciled view to an underlying file."""
def __init__(self, fileobj: IO, stencils: List[Tuple[int, int]]) -> None:
"""
        stencils: A list of tuples, each specifying an offset and length in the underlying file to use.
The order of these tuples will be kept.
The offset must be non-negative and the size must be positive.
Examples:
stencil = [(5,7)]
Makes a new 7B sized virtual file starting at offset 5 of fileobj.
stencil = [(0,3),(5,3)]
                Makes a new 6B sized virtual file containing bytes [0,1,2,5,6,7] of fileobj.
            stencil = [(0,3),(0,3)]
                Makes a 6B sized virtual file containing the first 3B of fileobj twice, concatenated together.
"""
# fmt: off
self.fileobj = fileobj
self.offsets = [x[0] for x in stencils]
self.sizes = [x[1] for x in stencils]
self.offset = 0
# fmt: on
# Calculate cumulative sizes
self.cumsizes = [0]
for offset, size in stencils:
assert offset >= 0
assert size > 0
self.cumsizes.append(self.cumsizes[-1] + size)
# Seek to the first stencil offset in the underlying file so that "read" will work out-of-the-box
self.seek(0)
def _findStencil(self, offset: int) -> int:
"""
        Returns the index of the stencil the offset belongs to. E.g., for stencils [(3,5),(8,2)], offsets 0 up to
        and including 4 are still inside the first stencil (3,5), i.e., index 0 will be returned. For offset 6,
        index 1 would be returned because it then is in the second contiguous region / stencil.
"""
# bisect_left( value ) gives an index for a lower range: value < x for all x in list[0:i]
# Because value >= 0 and list starts with 0 we can therefore be sure that the returned i>0
# Consider the stencils [(11,2),(22,2),(33,2)] -> cumsizes [0,2,4,6]. Seek to offset 2 should seek to 22.
assert offset >= 0
i = bisect.bisect_left(self.cumsizes, offset + 1) - 1
assert i >= 0
return i
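    # Worked example for the stencils [(3,5),(8,2)] from the docstring above: cumsizes == [0,5,7], so
    # _findStencil(4) == bisect_left([0,5,7], 5) - 1 == 0 and _findStencil(6) == bisect_left([0,5,7], 7) - 1 == 1.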
@overrides(io.BufferedIOBase)
def close(self) -> None:
# Don't close the object given to us
# self.fileobj.close()
pass
@overrides(io.BufferedIOBase)
def fileno(self) -> int:
return self.fileobj.fileno()
@overrides(io.BufferedIOBase)
def seekable(self) -> bool:
return self.fileobj.seekable()
@overrides(io.BufferedIOBase)
def readable(self) -> bool:
return self.fileobj.readable()
@overrides(io.BufferedIOBase)
def writable(self) -> bool:
return False
@overrides(io.BufferedIOBase)
def read(self, size: int = -1) -> bytes:
if size == -1:
size = self.cumsizes[-1] - self.offset
# This loop works in a kind of leapfrog fashion. On each even loop iteration it seeks to the next stencil
# and on each odd iteration it reads the data and increments the offset inside the stencil!
result = b''
i = self._findStencil(self.offset)
while size > 0 and i < len(self.sizes):
# Read as much as requested or as much as the current contiguous region / stencil still contains
readableSize = min(size, self.sizes[i] - (self.offset - self.cumsizes[i]))
if readableSize == 0:
# Go to next stencil
i += 1
if i >= len(self.offsets):
break
self.fileobj.seek(self.offsets[i])
else:
# Actually read data
tmp = self.fileobj.read(readableSize)
self.offset += len(tmp)
result += tmp
size -= readableSize
# Now, either size is 0 or readableSize will be 0 in the next iteration
return result
@overrides(io.BufferedIOBase)
def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
if whence == io.SEEK_CUR:
self.offset += offset
elif whence == io.SEEK_END:
self.offset = self.cumsizes[-1] + offset
elif whence == io.SEEK_SET:
self.offset = offset
if self.offset < 0:
raise ValueError("Trying to seek before the start of the file!")
if self.offset >= self.cumsizes[-1]:
return self.offset
i = self._findStencil(self.offset)
offsetInsideStencil = self.offset - self.cumsizes[i]
assert offsetInsideStencil >= 0
assert offsetInsideStencil < self.sizes[i]
self.fileobj.seek(self.offsets[i] + offsetInsideStencil, io.SEEK_SET)
return self.offset
@overrides(io.BufferedIOBase)
def tell(self) -> int:
return self.offset
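# Usage sketch (illustrative, assuming 'fileobj' is an open binary file):
#
#     stenciledFile = StenciledFile(fileobj, [(0, 3), (10, 5)])
#     stenciledFile.read()  # returns bytes 0-2 of fileobj followed by bytes 10-14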
# Names must be identical to the SQLite column headers!
FileInfo = collections.namedtuple(
"FileInfo", "offsetheader offset size mtime mode type linkname uid gid istar issparse"
)
class SQLiteIndexedTar:
"""
This class reads once through the whole TAR archive and stores TAR file offsets
for all contained files in an index to support fast seeking to a given file.
"""
# Version 0.1.0:
# - Initial version
# Version 0.2.0:
# - Add sparse support and 'offsetheader' and 'issparse' columns to the SQLite database
# - Add TAR file size metadata in order to quickly check whether the TAR changed
# - Add 'offsetheader' to the primary key of the 'files' table so that files which were
# updated in the TAR can still be accessed if necessary.
# Version 0.3.0:
# - Add arguments influencing the created index to metadata (ignore-zeros, recursive, ...)
__version__ = '0.3.0'
def __init__(
# fmt: off
self,
tarFileName : Optional[str] = None,
fileObject : Optional[BinaryIO] = None,
writeIndex : bool = False,
clearIndexCache : bool = False,
indexFileName : Optional[str] = None,
indexFolders : Optional[List[str]] = None,
recursive : bool = False,
gzipSeekPointSpacing : int = 4*1024*1024,
encoding : str = tarfile.ENCODING,
stripRecursiveTarExtension : bool = False,
ignoreZeros : bool = False,
verifyModificationTime : bool = False,
# fmt: on
) -> None:
"""
tarFileName : Path to the TAR file to be opened. If not specified, a fileObject must be specified.
If only a fileObject is given, the created index can't be cached (efficiently).
        fileObject : An io.IOBase derived object. If not specified, tarFileName will be opened.
If it is an instance of IndexedBzip2File, IndexedGzipFile, or IndexedZstdFile, then the offset
loading and storing from and to the SQLite database is managed automatically by this class.
encoding : Will be forwarded to tarfile. Specifies how filenames inside the TAR are encoded.
ignoreZeros : Will be forwarded to tarfile. Specifies to not only skip zero blocks but also blocks with
invalid data. Setting this to true can lead to some problems but is required to correctly
read concatenated tars.
stripRecursiveTarExtension : If true and if recursive is also true, then a <file>.tar inside the current
tar will be mounted at <file>/ instead of <file>.tar/.
"""
        # Caches the parent folders which were last added to the database and therefore are known to exist
self.parentFolderCache: List[Tuple[str, str]] = []
self.sqlConnection: Optional[sqlite3.Connection] = None
# fmt: off
self.mountRecursively = recursive
self.encoding = encoding
self.stripRecursiveTarExtension = stripRecursiveTarExtension
self.ignoreZeros = ignoreZeros
self.verifyModificationTime = verifyModificationTime
self.gzipSeekPointSpacing = gzipSeekPointSpacing
# fmt: on
if not tarFileName:
if not fileObject:
raise ValueError("At least one of tarFileName and fileObject arguments should be set!")
self.tarFileName = '<file object>'
self._createIndex(fileObject)
            # Return here because we cannot derive a path for saving the index without an identifying file name
return
self.tarFileName = os.path.abspath(tarFileName)
fileSize = None
if not fileObject:
fileObject = open(self.tarFileName, 'rb')
fileObject.seek(0, io.SEEK_END)
fileSize = fileObject.tell()
fileObject.seek(0)
# rawFileObject : Only set when opening a compressed file and only kept to keep the
# compressed file handle from being closed by the garbage collector.
# tarFileObject : File object to the uncompressed (or decompressed) TAR file to read actual data out of.
# compression : Stores what kind of compression the originally specified TAR file uses.
        # isTar : Can be false for the degenerate case of only a bz2 or gz file not containing a TAR
self.tarFileObject, self.rawFileObject, self.compression, self.isTar = SQLiteIndexedTar._openCompressedFile(
fileObject, gzipSeekPointSpacing, encoding
)
if self.compression == 'xz':
try:
if len(self.tarFileObject.block_boundaries) <= 1 and (fileSize is None or fileSize > 1024 * 1024):
print("[Warning] The specified file '{}'".format(self.tarFileName))
print("[Warning] is compressed using xz but only contains one xz block. This makes it ")
print("[Warning] impossible to use true seeking! Please (re)compress your TAR using pixz")
print("[Warning] (see https://github.com/vasi/pixz) in order for ratarmount to do be able ")
print("[Warning] to do fast seeking to requested files.")
print("[Warning] As it is, each file access will decompress the whole TAR from the beginning!")
print()
except Exception:
pass
# will be used for storing indexes if current path is read-only
possibleIndexFilePaths = [self.tarFileName + ".index.sqlite"]
indexPathAsName = self.tarFileName.replace("/", "_") + ".index.sqlite"
if isinstance(indexFolders, str):
indexFolders = [indexFolders]
if indexFileName:
# A given index file name takes precedence and there should be no implicit fallback
possibleIndexFilePaths = [os.path.abspath(os.path.expanduser(indexFileName))]
elif indexFolders:
# An empty path is to be interpreted as the default path right besides the TAR
if '' not in indexFolders:
possibleIndexFilePaths = []
for folder in indexFolders:
if folder:
indexPath = os.path.join(folder, indexPathAsName)
possibleIndexFilePaths.append(os.path.abspath(os.path.expanduser(indexPath)))
self.indexFileName = None
if clearIndexCache:
for indexPath in possibleIndexFilePaths:
if os.path.isfile(indexPath):
os.remove(indexPath)
# Try to find an already existing index
for indexPath in possibleIndexFilePaths:
if self._tryLoadIndex(indexPath):
self.indexFileName = indexPath
break
if self.indexIsLoaded():
self._loadOrStoreCompressionOffsets()
return
# Find a suitable (writable) location for the index database
if writeIndex:
for indexPath in possibleIndexFilePaths:
if self._pathIsWritable(indexPath) and self._pathCanBeUsedForSqlite(indexPath):
self.indexFileName = indexPath
break
if not self.indexFileName:
raise InvalidIndexError(
"Could not find any existing index or writable location for an index in " + str(possibleIndexFilePaths)
)
self._createIndex(self.tarFileObject)
self._loadOrStoreCompressionOffsets() # store
if self.sqlConnection:
self._storeMetadata(self.sqlConnection)
if printDebug >= 1 and writeIndex:
# The 0-time is legacy for the automated tests
# fmt: off
print("Writing out TAR index to", self.indexFileName, "took 0s",
"and is sized", os.stat( self.indexFileName ).st_size, "B")
# fmt: on
def _storeMetadata(self, connection: sqlite3.Connection) -> None:
self._storeVersionsMetadata(connection)
metadataTable = """
            /* key/value table whose existence also specifies that we finished iterating the tar */
CREATE TABLE "metadata" (
"key" VARCHAR(65535) NOT NULL, /* e.g. "tarsize" */
"value" VARCHAR(65535) NOT NULL /* e.g. size in bytes as integer */
);
"""
connection.executescript(metadataTable)
# All of these require the generic "metadata" table.
self._storeTarMetadata(connection, self.tarFileName)
self._storeArgumentsMetadata(connection)
connection.commit()
@staticmethod
def _storeVersionsMetadata(connection: sqlite3.Connection) -> None:
versionsTable = """
/* This table's sole existence specifies that we finished iterating the tar for older ratarmount versions */
CREATE TABLE "versions" (
"name" VARCHAR(65535) NOT NULL, /* which component the version belongs to */
"version" VARCHAR(65535) NOT NULL, /* free form version string */
/* Semantic Versioning 2.0.0 (semver.org) parts if they can be specified:
* MAJOR version when you make incompatible API changes,
* MINOR version when you add functionality in a backwards compatible manner, and
* PATCH version when you make backwards compatible bug fixes. */
"major" INTEGER,
"minor" INTEGER,
"patch" INTEGER
);
"""
try:
connection.executescript(versionsTable)
except Exception as exception:
if printDebug >= 2:
print(exception)
print("[Warning] There was an error when adding metadata information. Index loading might not work.")
try:
def makeVersionRow(
versionName: str, version: str
) -> Tuple[str, str, Optional[str], Optional[str], Optional[str]]:
versionNumbers = [re.sub('[^0-9]', '', x) for x in version.split('.')]
return (
versionName,
version,
versionNumbers[0] if len(versionNumbers) > 0 else None,
versionNumbers[1] if len(versionNumbers) > 1 else None,
versionNumbers[2] if len(versionNumbers) > 2 else None,
)
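            # E.g., makeVersionRow('ratarmount', '0.7.0') returns ('ratarmount', '0.7.0', '0', '7', '0').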
versions = [
makeVersionRow('ratarmount', __version__),
makeVersionRow('index', SQLiteIndexedTar.__version__),
]
for _, cinfo in supportedCompressions.items():
if cinfo.moduleName in globals():
versions += [makeVersionRow(cinfo.moduleName, globals()[cinfo.moduleName].__version__)]
connection.executemany('INSERT OR REPLACE INTO "versions" VALUES (?,?,?,?,?)', versions)
except Exception as exception:
print("[Warning] There was an error when adding version information.")
if printDebug >= 3:
print(exception)
@staticmethod
def _storeTarMetadata(connection: sqlite3.Connection, tarPath: AnyStr) -> None:
"""Adds some consistency meta information to recognize the need to update the cached TAR index"""
try:
tarStats = os.stat(tarPath)
serializedTarStats = json.dumps(
{attr: getattr(tarStats, attr) for attr in dir(tarStats) if attr.startswith('st_')}
)
connection.execute('INSERT INTO "metadata" VALUES (?,?)', ("tarstats", serializedTarStats))
except Exception as exception:
if printDebug >= 2:
print(exception)
print("[Warning] There was an error when adding file metadata.")
print("[Warning] Automatic detection of changed TAR files during index loading might not work.")
def _storeArgumentsMetadata(self, connection: sqlite3.Connection) -> None:
argumentsToSave = [
'mountRecursively',
'gzipSeekPointSpacing',
'encoding',
'stripRecursiveTarExtension',
'ignoreZeros',
]
argumentsMetadata = json.dumps({argument: getattr(self, argument) for argument in argumentsToSave})
try:
connection.execute('INSERT INTO "metadata" VALUES (?,?)', ("arguments", argumentsMetadata))
except Exception as exception:
if printDebug >= 2:
print(exception)
print("[Warning] There was an error when adding argument metadata.")
print("[Warning] Automatic detection of changed arguments files during index loading might not work.")
@staticmethod
def _pathIsWritable(path: AnyStr) -> bool:
try:
folder = os.path.dirname(path)
if folder:
os.makedirs(folder, exist_ok=True)
f = open(path, 'wb')
f.write(b'\0' * 1024 * 1024)
f.close()
os.remove(path)
return True
except IOError:
if printDebug >= 2:
traceback.print_exc()
print("Could not create file:", path)
return False
@staticmethod
def _pathCanBeUsedForSqlite(path: AnyStr) -> bool:
fileExisted = os.path.isfile(path)
try:
folder = os.path.dirname(path)
if folder:
os.makedirs(folder, exist_ok=True)
connection = SQLiteIndexedTar._openSqlDb(path)
connection.executescript('CREATE TABLE "files" ( "path" VARCHAR(65535) NOT NULL );')
connection.commit()
connection.close()
return True
except sqlite3.OperationalError:
if printDebug >= 2:
traceback.print_exc()
print("Could not create SQLite database at:", path)
finally:
if not fileExisted and os.path.isfile(path):
SQLiteIndexedTar._uncheckedRemove(path)
return False
@staticmethod
def _openSqlDb(path: AnyStr) -> sqlite3.Connection:
sqlConnection = sqlite3.connect(path)
sqlConnection.row_factory = sqlite3.Row
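        # Note (presumed rationale): these pragmas trade durability for speed. The index is only a
        # cache that can be recreated from the TAR at any time, so losing it on a crash is acceptable.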
sqlConnection.executescript(
"""
PRAGMA LOCKING_MODE = EXCLUSIVE;
PRAGMA TEMP_STORE = MEMORY;
PRAGMA JOURNAL_MODE = OFF;
PRAGMA SYNCHRONOUS = OFF;
"""
)
return sqlConnection
@staticmethod
def _initializeSqlDb(indexFileName: Optional[str]) -> sqlite3.Connection:
if printDebug >= 1:
print("Creating new SQLite index database at", indexFileName)
createTables = """
CREATE TABLE "files" (
"path" VARCHAR(65535) NOT NULL,
"name" VARCHAR(65535) NOT NULL,
"offsetheader" INTEGER, /* seek offset from TAR file where these file's contents resides */
"offset" INTEGER, /* seek offset from TAR file where these file's contents resides */
"size" INTEGER,
"mtime" INTEGER,
"mode" INTEGER,
"type" INTEGER,
"linkname" VARCHAR(65535),
"uid" INTEGER,
"gid" INTEGER,
/* True for valid TAR files. Internally used to determine where to mount recursive TAR files. */
"istar" BOOL ,
"issparse" BOOL , /* for sparse files the file size refers to the expanded size! */
/* See SQL benchmarks for decision on the primary key.
* See also https://www.sqlite.org/optoverview.html
* (path,name) tuples might appear multiple times in a TAR if it got updated.
* In order to also be able to show older versions, we need to add
* the offsetheader column to the primary key. */
PRIMARY KEY (path,name,offsetheader)
);
/* "A table created using CREATE TABLE AS has no PRIMARY KEY and no constraints of any kind"
* Therefore, it will not be sorted and inserting will be faster! */
CREATE TABLE "filestmp" AS SELECT * FROM "files" WHERE 0;
CREATE TABLE "parentfolders" (
"path" VARCHAR(65535) NOT NULL,
"name" VARCHAR(65535) NOT NULL,
PRIMARY KEY (path,name)
);
"""
sqlConnection = SQLiteIndexedTar._openSqlDb(indexFileName if indexFileName else ':memory:')
tables = sqlConnection.execute('SELECT name FROM sqlite_master WHERE type = "table";')
if {"files", "filestmp", "parentfolders"}.intersection({t[0] for t in tables}):
raise InvalidIndexError(
"The index file {} already seems to contain a table. "
"Please specify --recreate-index.".format(indexFileName)
)
sqlConnection.executescript(createTables)
return sqlConnection
@staticmethod
def _tarInfoFullMode(tarInfo: tarfile.TarInfo) -> int:
"""
Returns the full mode for a TarInfo object. Note that TarInfo.mode only contains the permission bits
and not other bits like set for directory, symbolic links, and other special files.
"""
return (
tarInfo.mode
# fmt: off
| ( stat.S_IFDIR if tarInfo.isdir () else 0 )
| ( stat.S_IFREG if tarInfo.isfile() else 0 )
| ( stat.S_IFLNK if tarInfo.issym () else 0 )
| ( stat.S_IFCHR if tarInfo.ischr () else 0 )
| ( stat.S_IFIFO if tarInfo.isfifo() else 0 )
# fmt: on
)
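    # E.g., a regular file with permission bits 0o644 yields 0o100644 == stat.S_IFREG | 0o644.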
def _updateProgressBar(self, progressBar, fileobj: Any) -> None:
try:
if hasattr(fileobj, 'tell_compressed') and self.compression == 'bz2':
# Note that because bz2 works on a bitstream the tell_compressed returns the offset in bits
progressBar.update(fileobj.tell_compressed() // 8)
elif hasattr(fileobj, 'tell_compressed'):
progressBar.update(fileobj.tell_compressed())
elif hasattr(fileobj, 'fileobj'):
progressBar.update(fileobj.fileobj().tell())
elif self.rawFileObject and hasattr(self.rawFileObject, 'tell'):
progressBar.update(self.rawFileObject.tell())
else:
progressBar.update(fileobj.tell())
except Exception:
pass
def _createIndex(
self,
# fmt: off
fileObject : Any,
progressBar : Any = None,
pathPrefix : str = '',
streamOffset: int = 0
# fmt: on
) -> None:
if printDebug >= 1:
print(
"Creating offset dictionary for",
"<file object>" if self.tarFileName is None else self.tarFileName,
"...",
)
t0 = timer()
# 1. If no SQL connection was given (by recursive call), open a new database file
openedConnection = False
if not self.indexIsLoaded() or not self.sqlConnection:
openedConnection = True
self.sqlConnection = self._initializeSqlDb(self.indexFileName)
# 2. Open TAR file reader
loadedTarFile: Any = [] # Feign an empty TAR file if anything goes wrong
if self.isTar:
try:
# r: uses seeks to skip to the next file inside the TAR while r| doesn't do any seeks.
# r| might be slower but for compressed files we have to go over all the data once anyways.
# Note that with ignore_zeros = True, no invalid header issues or similar will be raised even for
# non TAR files!?
loadedTarFile = tarfile.open(
# fmt:off
fileobj = fileObject,
mode = 'r|' if self.compression else 'r:',
ignore_zeros = self.ignoreZeros,
encoding = self.encoding,
# fmt:on
)
except tarfile.ReadError:
pass
if progressBar is None:
progressBar = ProgressBar(os.fstat(fileObject.fileno()).st_size)
# 3. Iterate over files inside TAR and add them to the database
try:
filesToMountRecursively = []
for tarInfo in loadedTarFile:
loadedTarFile.members = [] # Clear this in order to limit memory usage by tarfile
self._updateProgressBar(progressBar, fileObject)
# Add a leading '/' as a convention where '/' represents the TAR root folder
                # Partly done because fusepy specifies paths in a mounted directory like this.
                # os.path.normpath does not remove duplicate '/' at the beginning of the string!
fullPath = pathPrefix + "/" + os.path.normpath(tarInfo.name).lstrip('/')
path, name = fullPath.rsplit("/", 1)
# fmt: off
fileInfo = (
path , # 0
name , # 1
streamOffset + tarInfo.offset , # 2
streamOffset + tarInfo.offset_data, # 3
tarInfo.size , # 4
tarInfo.mtime , # 5
self._tarInfoFullMode(tarInfo) , # 6
tarInfo.type , # 7
tarInfo.linkname , # 8
tarInfo.uid , # 9
tarInfo.gid , # 10
False , # 11 (isTar)
tarInfo.issparse() , # 12
)
# fmt: on
if self.mountRecursively and tarInfo.isfile() and tarInfo.name.lower().endswith('.tar'):
filesToMountRecursively.append(fileInfo)
else:
self._setFileInfo(fileInfo)
except tarfile.ReadError as e:
if 'unexpected end of data' in str(e):
print(
"[Warning] The TAR file is incomplete. Ratarmount will work but some files might be cut off. "
"If the TAR file size changes, ratarmount will recreate the index during the next mounting."
)
# 4. Open contained TARs for recursive mounting
oldPos = fileObject.tell()
oldPrintName = self.tarFileName
for fileInfo in filesToMountRecursively:
tarExtension = '.tar'
fullPath = os.path.join(fileInfo[0], fileInfo[1])
if (
self.stripRecursiveTarExtension
and len(tarExtension) > 0
and fullPath.lower().endswith(tarExtension.lower())
):
modifiedFullPath = fullPath[: -len(tarExtension)]
else:
modifiedFullPath = fullPath
# Temporarily change tarFileName for the info output of the recursive call
self.tarFileName = fullPath
# StenciledFile's tell returns the offset inside the file chunk instead of the global one,
# so we have to always communicate the offset of this chunk to the recursive call no matter
# whether tarfile has streaming access or seeking access!
globalOffset = fileInfo[3]
size = fileInfo[4]
tarFileObject = StenciledFile(fileObject, [(globalOffset, size)])
isTar = False
try:
self._createIndex(tarFileObject, progressBar, modifiedFullPath, globalOffset)
isTar = True
except tarfile.ReadError:
pass
finally:
del tarFileObject
if isTar:
modifiedFileInfo = list(fileInfo)
# if the TAR file contents could be read, we need to adjust the actual
# TAR file's metadata to be a directory instead of a file
mode = modifiedFileInfo[6]
mode = (
(mode & 0o777)
| stat.S_IFDIR
| (stat.S_IXUSR if mode & stat.S_IRUSR != 0 else 0)
| (stat.S_IXGRP if mode & stat.S_IRGRP != 0 else 0)
| (stat.S_IXOTH if mode & stat.S_IROTH != 0 else 0)
)
path, name = modifiedFullPath.rsplit("/", 1)
modifiedFileInfo[0] = path
modifiedFileInfo[1] = name
modifiedFileInfo[6] = mode
modifiedFileInfo[11] = isTar
self._setFileInfo(tuple(modifiedFileInfo))
else:
self._setFileInfo(fileInfo)
fileObject.seek(oldPos)
self.tarFileName = oldPrintName
# Everything below should not be done in a recursive call of createIndex
if streamOffset > 0:
t1 = timer()
if printDebug >= 1:
print(
"Creating offset dictionary for",
"<file object>" if self.tarFileName is None else self.tarFileName,
"took {:.2f}s".format(t1 - t0),
)
return
# If no file is in the TAR, then it most likely indicates a possibly compressed non TAR file.
# In that case add that itself to the file index. This won't work when called recursively,
# so check stream offset.
fileCount = self.sqlConnection.execute('SELECT COUNT(*) FROM "files";').fetchone()[0]
if fileCount == 0:
tarInfo = os.fstat(fileObject.fileno())
fname = os.path.basename(self.tarFileName)
for suffix in ['.gz', '.bz2', '.bzip2', '.gzip', '.xz', '.zst', '.zstd']:
if fname.lower().endswith(suffix) and len(fname) > len(suffix):
fname = fname[: -len(suffix)]
break
# If the file object is actually an IndexedBzip2File or such, we can't directly use the file size
# from os.stat and instead have to gather it from seek. Unfortunately, indexed_gzip does not support
# io.SEEK_END even though it could as it has the index ...
while fileObject.read(1024 * 1024):
self._updateProgressBar(progressBar, fileObject)
fileSize = fileObject.tell()
# fmt: off
fileInfo = (
"" , # 0 path
fname , # 1
None , # 2 header offset
0 , # 3 data offset
fileSize , # 4
tarInfo.st_mtime , # 5
tarInfo.st_mode , # 6
None , # 7 TAR file type. Currently unused but overlaps with mode anyways
None , # 8 linkname
tarInfo.st_uid , # 9
tarInfo.st_gid , # 10
False , # 11 isTar
False , # 12 isSparse, don't care if it is actually sparse or not because it is not in TAR
)
# fmt: on
self._setFileInfo(fileInfo)
# All the code below is for database finalizing which should not be done in a recursive call of createIndex!
if not openedConnection:
return
# 5. Resort by (path,name). This one-time resort is faster than resorting on each INSERT (cache spill)
if printDebug >= 2:
print("Resorting files by path ...")
cleanupDatabase = """
INSERT OR REPLACE INTO "files" SELECT * FROM "filestmp" ORDER BY "path","name",rowid;
DROP TABLE "filestmp";
INSERT OR IGNORE INTO "files"
/* path name offsetheader offset size mtime mode type linkname uid gid istar issparse */
SELECT path,name,0,0,1,0,{},{},"",0,0,0,0
FROM "parentfolders" ORDER BY "path","name";
DROP TABLE "parentfolders";
""".format(
int(0o555 | stat.S_IFDIR), int(tarfile.DIRTYPE)
)
self.sqlConnection.executescript(cleanupDatabase)
self.sqlConnection.commit()
t1 = timer()
if printDebug >= 1:
print(
"Creating offset dictionary for",
"<file object>" if self.tarFileName is None else self.tarFileName,
"took {:.2f}s".format(t1 - t0),
)
@staticmethod
def _rowToFileInfo(row: Dict[str, Any]) -> FileInfo:
return FileInfo(
# fmt: off
offset = row['offset'],
offsetheader = row['offsetheader'] if 'offsetheader' in row.keys() else 0,
size = row['size'],
mtime = row['mtime'],
mode = row['mode'],
type = row['type'],
linkname = row['linkname'],
uid = row['uid'],
gid = row['gid'],
istar = row['istar'],
issparse = row['issparse'] if 'issparse' in row.keys() else False
# fmt: on
)
def getFileInfo(
self,
# fmt: off
fullPath : str,
listDir : bool = False,
listVersions : bool = False,
fileVersion : int = 0
# fmt: on
) -> Optional[Union[FileInfo, Dict[str, FileInfo]]]:
"""
This is the heart of this class' public interface!
        fullPath : full path to a file where '/' denotes the TAR's root, e.g., '/', or '/foo'
listDir : if True, return a dictionary for the given directory path: { fileName : FileInfo, ... }
if False, return simple FileInfo to given path (directory or file)
fileVersion : If the TAR contains the same file path multiple times, by default only the last one is shown.
But with this argument other versions can be queried. Version 1 is the oldest one.
Version 0 translates to the most recent one for compatibility with tar --occurrence=<number>.
Version -1 translates to the second most recent, and so on.
For listDir=True, the file version makes no sense and is ignored!
So, even if a folder was overwritten by a file, which is already not well supported by tar,
then listDir for that path will still list all contents of the overwritten folder or folders,
no matter the specified version. The file system layer has to take care that a directory
        listing is not even requested in the first place if it is not a directory.
FUSE already does this by calling getattr for all parent folders in the specified path first.
If path does not exist, always return None
If listVersions is true, then return metadata for all versions of a file possibly appearing more than once
in the TAR as a directory dictionary. listDir will then be ignored!
"""
# TODO cache last listDir as most often a stat over all entries will soon follow
if not isinstance(fileVersion, int):
raise TypeError("The specified file version must be an integer!")
if not self.sqlConnection:
raise IndexNotOpenError("This method can not be called without an opened index database!")
# also strips trailing '/' except for a single '/' and leading '/'
fullPath = '/' + os.path.normpath(fullPath).lstrip('/')
if listVersions:
path, name = fullPath.rsplit('/', 1)
rows = self.sqlConnection.execute(
'SELECT * FROM "files" WHERE "path" == (?) AND "name" == (?) ORDER BY "offsetheader" ASC', (path, name)
)
result = {str(version + 1): self._rowToFileInfo(row) for version, row in enumerate(rows)}
return result