diff --git a/config.sh b/config.sh index c03729925..51c58d3b7 100755 --- a/config.sh +++ b/config.sh @@ -24,9 +24,11 @@ function pre_build { if [ -n "$IS_OSX" ]; then sudo make install; sudo cp -r ../external/mpark/mpark /usr/local/include; + sudo cp -r ../external/mio/mio /usr/local/include; else make install; cp -r ../external/mpark/mpark /usr/local/include; + cp -r ../external/mio/mio /usr/local/include; fi popd diff --git a/lib/extension/dlisio/ext/io.hpp b/lib/extension/dlisio/ext/io.hpp index d46b6164f..f64b8569b 100644 --- a/lib/extension/dlisio/ext/io.hpp +++ b/lib/extension/dlisio/ext/io.hpp @@ -7,6 +7,8 @@ #include #include +#include + #include namespace dl { @@ -59,7 +61,14 @@ struct stream_offsets { void resize( std::size_t ) noexcept (false); }; -stream_offsets findoffsets( const std::string& path ) noexcept (false); +void map_source( mio::mmap_source&, const std::string& ) noexcept (false); + +long long findsul( mio::mmap_source& file ) noexcept (false); +long long findvrl( mio::mmap_source& path, long long from ) noexcept (false); + +stream_offsets findoffsets( mio::mmap_source& path, + long long from ) +noexcept (false); } diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index d7eca95ae..0e18e2fca 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -23,6 +23,12 @@ struct not_implemented : public std::logic_error { {} }; +struct not_found : public std::runtime_error { + explicit not_found( const std::string& msg ) + : runtime_error( msg ) + {} +}; + enum class representation_code : std::uint8_t { fshort = DLIS_FSHORT, fsingl = DLIS_FSINGL, diff --git a/lib/src/dlisio.cpp b/lib/src/dlisio.cpp index f3091bcb5..21b04c8d5 100644 --- a/lib/src/dlisio.cpp +++ b/lib/src/dlisio.cpp @@ -717,6 +717,15 @@ int dlis_index_records( const char* begin, if (end - DLIS_VRL_SIZE < ptr) return DLIS_TRUNCATED; + /* + * 2.3.6.4 Minimum Visible Record Length + * Since record segments must be at 
least 16 bytes, the + * effective minimum length for a visible record is 20 bytes + * (including itself), so anything less than that means + * corrupted data + */ + if (len < 20) return DLIS_UNEXPECTED_VALUE; + remaining = len - DLIS_VRL_SIZE; ptr += DLIS_VRL_SIZE; } @@ -728,6 +737,8 @@ int dlis_index_records( const char* begin, if (end - len < ptr) return DLIS_TRUNCATED; + if (len < 16) return DLIS_UNEXPECTED_VALUE; + ptr += len; remaining -= len; diff --git a/lib/src/io.cpp b/lib/src/io.cpp index 2402b0fc9..22915ac69 100644 --- a/lib/src/io.cpp +++ b/lib/src/io.cpp @@ -24,18 +24,126 @@ void stream_offsets::resize( std::size_t n ) noexcept (false) { this->explicits.resize( n ); } -stream_offsets findoffsets( const std::string& path ) noexcept (false) { +void map_source( mio::mmap_source& file, const std::string& path ) noexcept (false) { std::error_code syserror; - mio::mmap_source file; file.map( path, 0, mio::map_entire_file, syserror ); if (syserror) throw std::system_error( syserror ); + if (file.size() == 0) + throw std::invalid_argument( "non-existent or empty file" ); +} - const auto* begin = file.data() + 80; - const auto* end = file.data() + file.size(); +long long findsul( mio::mmap_source& file ) noexcept (false) { + /* + * search at most 200 bytes, looking for the SUL + * + * if it doesn't show up by then it's probably not there, or require other + * information + * + * Return the offset of the _first byte_ of the SUL. In a conforming file, + * this is 0. 
+ */ + static const auto needle = "RECORD"; + static const std::size_t search_limit = 200; + + const auto first = file.data(); + const auto last = first + (std::min)( file.size(), search_limit ); + auto itr = std::search( first, last, needle, needle + 6 ); + + if (itr == last) { + const std::string msg = "searched " + + std::to_string(search_limit) + + " bytes, but could not find SUL" + ; + throw dl::not_found( msg ); + } - if (file.size() == 0) - throw std::invalid_argument( "empty file" ); + /* + * Before the structure field of the SUL there should be 9 bytes, i.e. + * sequence-number and DLIS version. + */ + const auto structure_offset = 9; + + if (std::distance( first, itr ) < structure_offset) { + auto pos = std::distance( first, itr ); + const std::string msg = "found 'RECORD' at pos = " + + std::to_string( pos ) + + ", but expected pos >= 10" + ; + throw std::runtime_error( msg ); + } + + return std::distance( file.data(), itr - structure_offset ); +} + +long long findvrl( mio::mmap_source& file, long long from ) noexcept (false) { + /* + * The first VRL does sometimes not immediately follow the SUL (or whatever + * came before it), but according to spec it should be a triple of + * (len,0xFF,0x01), where len is a UNORM. The second half shouldn't change, + * so look for the first occurrence of that. 
+ * + * If that too doesn't work then the file is likely too corrupted to read + * without manual intervention + */ + + if (from < 0) { + std::stringstream msg; + msg << "from (which is " << from << ") >= 0"; + throw std::out_of_range(msg.str()); + } + + if (std::size_t(from) > file.size()) { + std::stringstream msg; + msg << "from (which is " << from << ") " + << "<= file.size (which is " << file.size() << ")" + ; + throw std::out_of_range(msg.str()); + } + + static const unsigned char needle[] = { 0xFF, 0x01 }; + static const auto search_limit = 200; + + const auto limit = std::min< long long >(file.size() - from, search_limit); + + /* + * reinterpret the bytes as unsigned char*. This is compatible and fine. + * + * When operator == is used on the elements, they'll otherwise be promoted + * to int, so all of a sudden (char)0xFF != (unsigned char)0xFF. Forcing + * the pointer to be unsigned char fixes this issue. + */ + const auto front = reinterpret_cast< const unsigned char* >(file.data()); + const auto first = front + from; + const auto last = first + limit; + const auto itr = std::search(first, last, needle, needle + sizeof(needle)); + + if (itr == last) { + std::stringstream msg; + msg << "searched " << limit << " bytes, but could not find VRL"; + throw dl::not_found( msg.str() ); + } + + /* + * Before the 0xFF 0x01 there should be room for at least an unorm + */ + if (std::distance( first, itr ) < DLIS_SIZEOF_UNORM) { + auto pos = std::distance( first, itr ); + std::stringstream msg; + msg << "found 0xFF 0x01 at pos = " << from + pos + << ", but expected pos >= " << from + DLIS_SIZEOF_UNORM + ; + throw std::runtime_error(msg.str()); + } + + return std::distance(front, itr - DLIS_SIZEOF_UNORM); +} + +stream_offsets findoffsets( mio::mmap_source& file, long long from ) +noexcept (false) +{ + const auto* begin = file.data() + from; + const auto* end = file.data() + file.size(); // by default, assume ~4K per segment on average. 
This should be fairly few // reallocations, without overshooting too much @@ -63,14 +171,27 @@ stream_offsets findoffsets( const std::string& path ) noexcept (false) { count + residuals.data(), count + explicits.data() ); - if (err == DLIS_TRUNCATED) - throw std::runtime_error( "file truncated" ); + switch (err) { + case DLIS_OK: break; - if (err == DLIS_INCONSISTENT) - throw std::runtime_error( "inconsistensies in record sizes" ); + case DLIS_TRUNCATED: + throw std::runtime_error( "file truncated" ); - if (err) - throw std::runtime_error( "unknown error " + std::to_string( err ) ); + case DLIS_INCONSISTENT: + throw std::runtime_error( "inconsistensies in record sizes" ); + + case DLIS_UNEXPECTED_VALUE: { + std::stringstream msg; + // TODO: interrogate more? + msg << "record-length in record " << count << " corrupted"; + throw std::runtime_error(msg.str()); + } + + default: + throw std::runtime_error( + "unknown error " + std::to_string( err ) + ); + } if (next == end) break; diff --git a/python/data/pre-sul-garbage.dlis b/python/data/pre-sul-garbage.dlis new file mode 100644 index 000000000..42804f874 Binary files /dev/null and b/python/data/pre-sul-garbage.dlis differ diff --git a/python/data/pre-sul-pre-vrl-garbage.dlis b/python/data/pre-sul-pre-vrl-garbage.dlis new file mode 100644 index 000000000..e69e7af1b Binary files /dev/null and b/python/data/pre-sul-pre-vrl-garbage.dlis differ diff --git a/python/dlisio/__init__.py b/python/dlisio/__init__.py index 903316cc7..1067c13ee 100644 --- a/python/dlisio/__init__.py +++ b/python/dlisio/__init__.py @@ -9,11 +9,12 @@ pass class dlis(object): - def __init__(self, stream, explicits): + def __init__(self, stream, explicits, sul_offset = 80): self.file = stream self.explicit_indices = explicits self.object_sets = None self._objects = Objectpool(self.objectsets()) + self.sul_offset = sul_offset def __enter__(self): return self @@ -22,7 +23,7 @@ def __exit__(self, type, value, traceback): self.file.close() def 
storage_label(self): - blob = self.file.get(bytearray(80), 0, 80) + blob = self.file.get(bytearray(80), self.sul_offset, 80) return core.storage_label(blob) def objectsets(self, reload = False): @@ -116,14 +117,52 @@ def unknowns(self): return self._objects.unknowns def open(path): - tells, residuals, explicits = core.findoffsets(path) + """ Open a file + + Open a low-level file handle. This is not intended for end-users - rather, + it's an escape hatch for very broken files that dlisio cannot handle. + + Parameters + ---------- + path : str_like + + Returns + ------- + stream : dlisio.core.stream + + See Also + -------- + dlisio.load + """ + return core.stream(str(path)) + +def load(path): + """ Load a file + + Parameters + ---------- + path : str_like + + Returns + ------- + dlis : dlisio.dlis + """ + path = str(path) + + mmap = core.mmap_source() + mmap.map(path) + + sulpos = core.findsul(mmap) + vrlpos = core.findvrl(mmap, sulpos + 80) + + tells, residuals, explicits = core.findoffsets(mmap, vrlpos) explicits = [i for i, explicit in enumerate(explicits) if explicit != 0] - stream = core.stream(path) + stream = open(path) try: stream.reindex(tells, residuals) - f = dlis(stream, explicits) + f = dlis(stream, explicits, sul_offset = sulpos) except: stream.close() raise diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index cf2c7844a..62bfca3fa 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -361,13 +361,23 @@ PYBIND11_MODULE(core, m) { return objects; }); - m.def( "findoffsets", []( const std::string& path ) { - const auto ofs = dl::findoffsets( path ); + py::class_< mio::mmap_source >( m, "mmap_source" ) + .def( py::init<>() ) + .def( "map", dl::map_source ) + ; + + m.def( "findsul", dl::findsul ); + m.def( "findvrl", dl::findvrl ); + + m.def( "findoffsets", []( mio::mmap_source& file, long long from ) { + const auto ofs = dl::findoffsets( file, from ); return py::make_tuple( ofs.tells, ofs.residuals, ofs.explicits 
); }); m.def( "marks", [] ( const std::string& path ) { - auto marks = dl::findoffsets( path ); + mio::mmap_source file; + dl::map_source( file, path ); + auto marks = dl::findoffsets( file, 80 ); return py::make_tuple( marks.residuals, marks.tells ); }); } diff --git a/python/setup.py b/python/setup.py index 5f6a995a5..9ad9cf1b8 100755 --- a/python/setup.py +++ b/python/setup.py @@ -80,6 +80,7 @@ def getversion(): include_dirs = ['../lib/include', '../lib/extension', '../external/mpark', + '../external/mio', get_pybind_include(), get_pybind_include(user=True), ], diff --git a/python/tests/test_core.py b/python/tests/test_core.py index dd502ed01..b122721d2 100644 --- a/python/tests/test_core.py +++ b/python/tests/test_core.py @@ -22,7 +22,7 @@ def test_sul(): assert sul == d - with dlisio.open('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: + with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: assert f.storage_label() == d # The example record from the specification @@ -139,12 +139,12 @@ def test_sul(): ]) def test_objects(): - with dlisio.open('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: + with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: objects = f.objects assert len(list(objects)) == 876 def test_channels(): - with dlisio.open('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: + with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: channel = next(f.channels) assert channel.name.id == "TDEP" assert channel.name.origin == 2 @@ -160,7 +160,7 @@ def test_channels(): assert channel.source is None def test_frames(): - with dlisio.open('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: + with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: frame = next(f.frames) assert frame.name.id == "2000T" assert frame.name.origin == 2 @@ -179,7 +179,7 @@ def test_frames(): assert len(fchannels) == len(frame.channels) def test_tools(): - with 
dlisio.open('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: + with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: tool = next(f.tools) assert tool.name.id == "MSCT" assert tool.name.origin == 2 @@ -201,7 +201,7 @@ def test_tools(): assert len(tools) == 1 def test_parameters(): - with dlisio.open('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: + with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: param = next(f.parameters) assert param.name.id == "FLSHSTRM" assert param.name.origin == 2 @@ -216,7 +216,7 @@ def test_parameters(): assert len(param) == 1 def test_calibrations(): - with dlisio.open('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: + with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: calibration = next(f.calibrations) assert calibration.name.id == "CNU" assert calibration.name.origin == 2 @@ -238,7 +238,7 @@ def test_calibrations(): assert uncal_ch[0] == calibration def test_contains(): - with dlisio.open('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: + with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: frame = f.getobject(("2000T", 2, 0), type="frame") name = ("TDEP", 2, 4) channel = f.getobject(name, type="channel") @@ -249,16 +249,28 @@ def test_contains(): assert result == True def test_Unknown(): - with dlisio.open('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: + with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: unknown = next(f.unknowns) assert unknown.type == "unknown" assert len(list(f.unknowns)) == 515 def test_object(): - with dlisio.open('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: + with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f: name = ("2000T", 2, 0) frame = f.getobject(name=name, type='frame') assert frame.name.id == "2000T" assert frame.name.origin == 2 assert frame.name.copynumber == 0 + +def test_load_pre_sul_garbage(): + with dlisio.load('data/pre-sul-garbage.dlis') as f: + with 
dlisio.load('data/only-channels.dlis') as g: + assert f.storage_label() == g.storage_label() + assert f.sul_offset == 12 + +def test_load_pre_vrl_garbage(): + with dlisio.load('data/pre-sul-pre-vrl-garbage.dlis') as f: + with dlisio.load('data/only-channels.dlis') as g: + assert f.storage_label() == g.storage_label() + assert f.sul_offset == 12