Skip to content

Commit

Permalink
Merge pull request #52 from equinor/files-extra-bytes-before-after-sul
Browse files Browse the repository at this point in the history
Files extra bytes before after sul
  • Loading branch information
jokva authored Feb 28, 2019
2 parents 529568a + 7a13c21 commit 62a2bc7
Show file tree
Hide file tree
Showing 11 changed files with 242 additions and 31 deletions.
2 changes: 2 additions & 0 deletions config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@ function pre_build {
if [ -n "$IS_OSX" ]; then
sudo make install;
sudo cp -r ../external/mpark/mpark /usr/local/include;
sudo cp -r ../external/mio/mio /usr/local/include;
else
make install;
cp -r ../external/mpark/mpark /usr/local/include;
cp -r ../external/mio/mio /usr/local/include;
fi

popd
Expand Down
11 changes: 10 additions & 1 deletion lib/extension/dlisio/ext/io.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <tuple>
#include <vector>

#include <mio/mio.hpp>

#include <dlisio/ext/types.hpp>

namespace dl {
Expand Down Expand Up @@ -59,7 +61,14 @@ struct stream_offsets {
void resize( std::size_t ) noexcept (false);
};

stream_offsets findoffsets( const std::string& path ) noexcept (false);
/*
 * Map the entire file at `path` into `file`.
 *
 * Throws std::system_error if the mapping fails, and std::invalid_argument if
 * the mapped file turns out to be empty.
 */
void map_source( mio::mmap_source& file, const std::string& path ) noexcept (false);

/*
 * Search the start of `file` for the storage unit label (SUL) and return the
 * offset of its first byte.
 *
 * Throws dl::not_found if no SUL shows up within the search window, and
 * std::runtime_error if the match sits too close to the start of the file to
 * be a complete SUL.
 */
long long findsul( mio::mmap_source& file ) noexcept (false);

/*
 * Search `file`, starting at byte offset `from`, for the first visible record
 * and return the offset of its length field.
 *
 * Throws std::out_of_range if `from` is negative or past the end of the file,
 * dl::not_found if nothing is found within the search window, and
 * std::runtime_error if the match leaves no room for the length field.
 */
long long findvrl( mio::mmap_source& file, long long from ) noexcept (false);

/*
 * Index the records in `file`, starting at byte offset `from`, which is
 * expected to be the start of a visible record (e.g. as returned by findvrl).
 *
 * Throws std::runtime_error if the file is truncated or the record lengths
 * are inconsistent or corrupted.
 */
stream_offsets findoffsets( mio::mmap_source& file,
                            long long from )
noexcept (false);

}

Expand Down
6 changes: 6 additions & 0 deletions lib/extension/dlisio/ext/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ struct not_implemented : public std::logic_error {
{}
};

/*
 * Signals that a search gave up without locating its target, e.g. when a
 * marker that should be in the file cannot be found within the search window.
 * The message is available through what().
 */
struct not_found : public std::runtime_error {
    explicit not_found( const std::string& msg ) : std::runtime_error( msg ) {}
};

enum class representation_code : std::uint8_t {
fshort = DLIS_FSHORT,
fsingl = DLIS_FSINGL,
Expand Down
11 changes: 11 additions & 0 deletions lib/src/dlisio.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -717,6 +717,15 @@ int dlis_index_records( const char* begin,

if (end - DLIS_VRL_SIZE < ptr) return DLIS_TRUNCATED;

/*
* 2.3.6.4 Minimum Visible Record Length
* Since record segments must be at least 16 bytes, the
* effective minimum length for a visible record is 20 bytes
* (including itself), so anything less than that means
* corrupted data
*/
if (len < 20) return DLIS_UNEXPECTED_VALUE;

remaining = len - DLIS_VRL_SIZE;
ptr += DLIS_VRL_SIZE;
}
Expand All @@ -728,6 +737,8 @@ int dlis_index_records( const char* begin,

if (end - len < ptr) return DLIS_TRUNCATED;

if (len < 16) return DLIS_UNEXPECTED_VALUE;

ptr += len;
remaining -= len;

Expand Down
145 changes: 133 additions & 12 deletions lib/src/io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,126 @@ void stream_offsets::resize( std::size_t n ) noexcept (false) {
this->explicits.resize( n );
}

stream_offsets findoffsets( const std::string& path ) noexcept (false) {
/*
 * Map the entire file at `path` into the caller-provided `file`.
 *
 * The mapping is written to the `file` argument itself (no local mmap_source
 * is created), so it stays valid for as long as the caller keeps `file`
 * around.
 *
 * Throws std::system_error if mio reports an error while mapping, and
 * std::invalid_argument if the resulting mapping has size 0.
 */
void map_source( mio::mmap_source& file, const std::string& path ) noexcept (false) {
    std::error_code syserror;
    file.map( path, 0, mio::map_entire_file, syserror );
    if (syserror) throw std::system_error( syserror );

    if (file.size() == 0)
        throw std::invalid_argument( "non-existent or empty file" );
}

const auto* begin = file.data() + 80;
const auto* end = file.data() + file.size();
/*
 * Locate the storage unit label (SUL).
 *
 * Search at most 200 bytes, looking for the structure field ("RECORD") of the
 * SUL. If it doesn't show up by then it's probably not there, or finding it
 * requires other information.
 *
 * Returns the offset of the _first byte_ of the SUL. In a conforming file,
 * this is 0.
 *
 * Throws dl::not_found if "RECORD" is not found in the search window (this
 * includes the empty file), and std::runtime_error if it is found too early
 * for the fields preceding it to fit.
 */
long long findsul( mio::mmap_source& file ) noexcept (false) {
    static const auto needle = "RECORD";
    static const std::size_t needle_len = 6;
    static const std::size_t search_limit = 200;

    /*
     * Before the structure field of the SUL there should be 9 bytes, i.e.
     * the sequence-number (4 bytes) and the DLIS version (5 bytes).
     */
    const auto structure_offset = 9;

    const auto first = file.data();
    const auto last = first + (std::min)( file.size(), search_limit );
    const auto itr = std::search( first, last, needle, needle + needle_len );

    if (itr == last) {
        const std::string msg = "searched "
                              + std::to_string(search_limit)
                              + " bytes, but could not find SUL"
                              ;
        throw dl::not_found( msg );
    }

    const auto pos = std::distance( first, itr );

    if (pos < structure_offset) {
        // report the bound that is actually enforced, so that the message
        // cannot drift from the check
        const std::string msg = "found 'RECORD' at pos = "
                              + std::to_string( pos )
                              + ", but expected pos >= "
                              + std::to_string( structure_offset )
                              ;
        throw std::runtime_error( msg );
    }

    return pos - structure_offset;
}

/*
 * Locate the first visible record, searching from byte offset `from`.
 *
 * The first visible record does not always immediately follow the SUL (or
 * whatever came before it). According to the spec it should start with the
 * triple (len,0xFF,0x01), where len is a UNORM. The last two bytes shouldn't
 * change, so look for the first occurrence of that pair within 200 bytes.
 *
 * If that doesn't work either, the file is likely too corrupted to read
 * without manual intervention.
 *
 * Returns the offset, from the start of the file, of the length field.
 *
 * Throws std::out_of_range if `from` is negative or beyond the file size,
 * dl::not_found if the pair is not found in the search window, and
 * std::runtime_error if the pair is found with no room for the length in
 * front of it.
 */
long long findvrl( mio::mmap_source& file, long long from ) noexcept (false) {
    if (from < 0) {
        std::stringstream what;
        what << "from (which is " << from << ") >= 0";
        throw std::out_of_range( what.str() );
    }

    const auto filesize = file.size();

    if (std::size_t(from) > filesize) {
        std::stringstream what;
        what << "from (which is " << from << ") "
             << "<= file.size (which is " << filesize << ")"
        ;
        throw std::out_of_range( what.str() );
    }

    static const unsigned char marker[] = { 0xFF, 0x01 };
    static const auto max_search = 200;

    const auto window = std::min< long long >( filesize - from, max_search );

    /*
     * View the mapped bytes as unsigned char. This is compatible and fine.
     *
     * Otherwise, the elements are promoted to int when compared with
     * operator ==, and then (char)0xFF != (unsigned char)0xFF. Comparing
     * unsigned char against unsigned char avoids the problem.
     */
    const auto base  = reinterpret_cast< const unsigned char* >( file.data() );
    const auto start = base + from;
    const auto stop  = start + window;
    const auto hit   = std::search( start, stop,
                                    std::begin( marker ), std::end( marker ) );

    if (hit == stop) {
        std::stringstream what;
        what << "searched " << window << " bytes, but could not find VRL";
        throw dl::not_found( what.str() );
    }

    /*
     * In front of the 0xFF 0x01 there must be room for at least a unorm
     */
    const auto pos = std::distance( start, hit );

    if (pos < DLIS_SIZEOF_UNORM) {
        std::stringstream what;
        what << "found 0xFF 0x01 at pos = " << from + pos
             << ", but expected pos >= " << from + DLIS_SIZEOF_UNORM
        ;
        throw std::runtime_error( what.str() );
    }

    return from + pos - DLIS_SIZEOF_UNORM;
}

stream_offsets findoffsets( mio::mmap_source& file, long long from )
noexcept (false)
{
const auto* begin = file.data() + from;
const auto* end = file.data() + file.size();

// by default, assume ~4K per segment on average. This should be fairly few
// reallocations, without overshooting too much
Expand Down Expand Up @@ -63,14 +171,27 @@ stream_offsets findoffsets( const std::string& path ) noexcept (false) {
count + residuals.data(),
count + explicits.data() );

if (err == DLIS_TRUNCATED)
throw std::runtime_error( "file truncated" );
switch (err) {
case DLIS_OK: break;

if (err == DLIS_INCONSISTENT)
throw std::runtime_error( "inconsistensies in record sizes" );
case DLIS_TRUNCATED:
throw std::runtime_error( "file truncated" );

if (err)
throw std::runtime_error( "unknown error " + std::to_string( err ) );
case DLIS_INCONSISTENT:
throw std::runtime_error( "inconsistensies in record sizes" );

case DLIS_UNEXPECTED_VALUE: {
std::stringstream msg;
// TODO: interrogate more?
msg << "record-length in record " << count << " corrupted";
throw std::runtime_error(msg.str());
}

default:
throw std::runtime_error(
"unknown error " + std::to_string( err )
);
}

if (next == end) break;

Expand Down
Binary file added python/data/pre-sul-garbage.dlis
Binary file not shown.
Binary file added python/data/pre-sul-pre-vrl-garbage.dlis
Binary file not shown.
49 changes: 44 additions & 5 deletions python/dlisio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@
pass

class dlis(object):
def __init__(self, stream, explicits, sul_offset = 0):
    """ Wrap an indexed stream

    Parameters
    ----------
    stream
        the underlying file handle; closed when leaving the context manager
    explicits
        indices of the explicitly formatted records
    sul_offset : int, optional
        offset of the first byte of the storage unit label, where
        storage_label() reads its 80 bytes from. Defaults to 0, which is
        where the label is in a conforming file; load() passes the offset
        found by core.findsul
    """
    self.file = stream
    self.explicit_indices = explicits
    # set before the object pool is built, so that the instance is fully
    # initialised by the time objectsets() runs
    self.sul_offset = sul_offset
    self.object_sets = None
    self._objects = Objectpool(self.objectsets())

def __enter__(self):
return self
Expand All @@ -22,7 +23,7 @@ def __exit__(self, type, value, traceback):
self.file.close()

def storage_label(self):
    """ Read and parse the storage unit label

    Reads 80 bytes starting at self.sul_offset and hands them to
    core.storage_label for parsing.
    """
    # read from sul_offset only; the label is not necessarily at offset 0
    blob = self.file.get(bytearray(80), self.sul_offset, 80)
    return core.storage_label(blob)

def objectsets(self, reload = False):
Expand Down Expand Up @@ -116,14 +117,52 @@ def unknowns(self):
return self._objects.unknowns

def open(path):
    """ Open a file

    Open a low-level file handle. This is not intended for end-users - rather,
    it's an escape hatch for very broken files that dlisio cannot handle.

    Parameters
    ----------
    path : str_like

    Returns
    -------
    stream : dlisio.core.stream

    See Also
    --------
    dlisio.load
    """
    return core.stream(str(path))

def load(path):
""" Load a file
Parameters
----------
path : str_like
Returns
-------
dlis : dlisio.dlis
"""
path = str(path)

mmap = core.mmap_source()
mmap.map(path)

sulpos = core.findsul(mmap)
vrlpos = core.findvrl(mmap, sulpos + 80)

tells, residuals, explicits = core.findoffsets(mmap, vrlpos)
explicits = [i for i, explicit in enumerate(explicits) if explicit != 0]

stream = core.stream(path)
stream = open(path)

try:
stream.reindex(tells, residuals)
f = dlis(stream, explicits)
f = dlis(stream, explicits, sul_offset = sulpos)
except:
stream.close()
raise
Expand Down
16 changes: 13 additions & 3 deletions python/dlisio/ext/core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -361,13 +361,23 @@ PYBIND11_MODULE(core, m) {
return objects;
});

m.def( "findoffsets", []( const std::string& path ) {
const auto ofs = dl::findoffsets( path );
py::class_< mio::mmap_source >( m, "mmap_source" )
.def( py::init<>() )
.def( "map", dl::map_source )
;

m.def( "findsul", dl::findsul );
m.def( "findvrl", dl::findvrl );

m.def( "findoffsets", []( mio::mmap_source& file, long long from ) {
const auto ofs = dl::findoffsets( file, from );
return py::make_tuple( ofs.tells, ofs.residuals, ofs.explicits );
});

m.def( "marks", [] ( const std::string& path ) {
auto marks = dl::findoffsets( path );
mio::mmap_source file;
dl::map_source( file, path );
auto marks = dl::findoffsets( file, 80 );
return py::make_tuple( marks.residuals, marks.tells );
});
}
1 change: 1 addition & 0 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def getversion():
include_dirs = ['../lib/include',
'../lib/extension',
'../external/mpark',
'../external/mio',
get_pybind_include(),
get_pybind_include(user=True),
],
Expand Down
Loading

0 comments on commit 62a2bc7

Please sign in to comment.