Cannot read europe file #5

Open
ghost opened this issue Jul 29, 2016 · 9 comments

ghost commented Jul 29, 2016

I am experiencing an issue reading the 17 GB Europe extract. The application always crashes on line 186 in parsehelpers.h while trying to do
blobsRead += dbufs.size();
Somehow the variables inFile, processor, mtx, blobsRead, doProcessing, threadPrivateProcessor and maxBlobsToRead can't be read by a thread, at least that's what I see when breaking on the exception in the debugger. Please try the code I am running and check whether you see the same issue. The parser works with the 1.4 GB Africa extract, though. Both the Europe and Africa extracts were downloaded from http://download.geofabrik.de/.

My example counter application, based on your examples:

//leak detector
#include <vld.h>

////parser
#include <osmpbf/parsehelpers.h>
#include <osmpbf/inode.h>
#include <osmpbf/iway.h>
#include <osmpbf/irelation.h>
#include <osmpbf/filter.h>
#include <google/protobuf/stubs/common.h>


//cassandra driver - connecting to ScyllaDB
#include <cassandra.h>
#include <stdio.h>

//filtering functions
#include <highway/HighwayTagChecker.h>
#include <poi/PoiTagChecker.h>

//serialization to DB/disk
#include <serialization/disk/DiskSerializer.h>

//additional includes
#include <chrono>
#include <tuple>
#include <iostream>
#include <vector>
#include <string>
#include <mutex>
#include <thread>
#include <locale>
#include <algorithm>

using namespace std::chrono;
class commaPunct : public std::numpunct<char>
{
protected:
    virtual std::string do_grouping() const
    {
        return "\03";
    }

    virtual char do_thousands_sep() const
    {
        return ',';
    }
};


inline std::string primitiveTypeToString(osmpbf::PrimitiveType t) {
    switch (t) {
    case osmpbf::PrimitiveType::NodePrimitive:
        return "node";
    case osmpbf::PrimitiveType::WayPrimitive:
        return "way";
    case osmpbf::PrimitiveType::RelationPrimitive:
        return "relation";
    default:
        return "invalid";
    }
}


struct SharedState 
{
    std::mutex lock;

    //element counts
    uint64_t nodeCount;
    uint64_t wayCount;
    uint64_t relationCount;

    //number of highways / number of edges added to the graph
    uint64_t highway_count;
    uint64_t edge_count;

    //POI counts
    uint64_t nodePoiCount;
    uint64_t wayPoiCount;
    SharedState() : nodeCount(0), wayCount(0), relationCount(0),nodePoiCount(0),wayPoiCount(0), highway_count(0), edge_count(0) {}
};


struct MyCounter 
{
    SharedState * state;
    uint64_t nodeCount;
    uint64_t wayCount;
    uint64_t relationCount;
    uint64_t highway_count;
    uint64_t edge_count;
    uint64_t nodePoiCount;
    uint64_t wayPoiCount;
    HighwayTagChecker ht;
    PoiTagChecker pt;
    Serializer serializer;

    MyCounter(SharedState * state) : state(state), nodeCount(0), wayCount(0), relationCount(0), highway_count(0), edge_count(0), nodePoiCount(0), wayPoiCount(0)
    {
        ht = HighwayTagChecker();
        pt = PoiTagChecker();
    }
    //copy constructor: each worker thread gets its own copy when threadPrivateProcessor is true
    MyCounter(const MyCounter & other) : state(other.state), nodeCount(0), wayCount(0), relationCount(0), highway_count(0), edge_count(0), nodePoiCount(0), wayPoiCount(0)
    {
        ht = HighwayTagChecker();
        pt = PoiTagChecker();
    }
    void operator()(osmpbf::PrimitiveBlockInputAdaptor & pbi) 
    {
        nodeCount = wayCount = relationCount = nodePoiCount = wayPoiCount = edge_count = highway_count =  0;
        std::vector<Node> nodes;
        std::vector<Edge> edges;

        for (osmpbf::INodeStream node(pbi.getNodeStream()); !node.isNull(); node.next()) 
        {
            //std::cout << "<node id=" << node.id() << " lat=" << node.latd() << " lon=" << node.lond() << ">" << std::endl;
            Poi* poi = pt.getPoi(node);
            if (poi != nullptr)
            {
                ++nodePoiCount;
                delete poi;
            }

            Node n;
            n.id = node.id();
            n.lat = node.latd();
            n.lon = node.lond();
            nodes.push_back(n);

            ++nodeCount;
        }
        for (osmpbf::IWayStream way(pbi.getWayStream()); !way.isNull(); way.next())
        {
            //if it is a highway, create edges
            HighWay* hw = ht.getHighway(way);
            if (hw != nullptr)
            {
                highway_count++;
                delete hw;
            }


            Poi* poi = pt.getPoi(way);
            if (poi != nullptr)
            {
                ++wayPoiCount;
                delete poi;
            }

            ++wayCount;
        }
        for (osmpbf::IRelationStream rel(pbi.getRelationStream()); !rel.isNull(); rel.next())
        {
            /*std::cout << "<relation id=" << rel.id() << ">" << std::endl;
            for (osmpbf::IMemberStream mem(rel.getMemberStream()); !mem.isNull(); mem.next()) 
            {
                //std::cout << "\t<member type=" << primitiveTypeToString(mem.type()) << " ref=" << mem.id() << " role=" << mem.role() << "/>" << std::endl;
            }
            for (uint32_t i = 0, s = rel.tagsSize(); i < s; ++i) 
            {
                //std::cout << "\t<tag k=" << relation.key(i) << " v=" << relation.value(i) << ">" << std::endl;
            }
            //std::cout << "</relation>" << std::endl;
            */

            ++relationCount;
        }


        //now flush everything to shared state
        std::unique_lock<std::mutex> lck(state->lock);
        state->nodeCount += nodeCount;
        state->wayCount += wayCount;
        state->relationCount += relationCount;
        state->highway_count += highway_count;
        state->edge_count += edge_count;
        state->nodePoiCount += nodePoiCount;
        state->wayPoiCount += wayPoiCount;
    }
};




int main(int argc, char ** argv) 
{
    if (argc < 2)
    {
        std::cout << "Usage: " << argv[0] << " file.osm.pbf" << std::endl;
        return -1;
    }
    std::string fileName(argv[1]);
    SharedState state;
    osmpbf::OSMFileIn inFile(fileName);



    if (!inFile.open()) 
    {
        std::cout << "Failed to open " << fileName << std::endl;
        google::protobuf::ShutdownProtobufLibrary();
        return -1;
    }

    uint32_t threadCount = std::max<int>(std::thread::hardware_concurrency(), 1); //use all available hardware threads, at least 1
    uint32_t readBlobCount = 2; //parse 2 blocks at once
    bool threadPrivateProcessor = true; //set to true so that MyCounter is copied


    high_resolution_clock::time_point t1 = high_resolution_clock::now();
    osmpbf::parseFileCPPThreads(inFile, MyCounter(&state), threadCount, readBlobCount, threadPrivateProcessor);

    high_resolution_clock::time_point t2 = high_resolution_clock::now();
    int64_t duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();

    std::cout << "Celkovy cas " << duration << " ms" << std::endl;

    std::locale comma(std::locale(), new commaPunct());
    std::cout.imbue(comma);

    std::cout << "File " << fileName << " has the following amounts of matching primitives:\n";
    std::cout << "Nodes: " << state.nodeCount  << "\n";
    std::cout << "Ways: " << state.wayCount << "\n";
    std::cout << "Relations: " << state.relationCount << "\n";
    std::cout << "Highways: " << state.highway_count << "\n";
    std::cout << "Added edges: " << state.edge_count << "\n";
    std::cout << "Node poi: " << state.nodePoiCount << "\n";
    std::cout << "Way poi: " << state.wayPoiCount << "\n";
    std::cout << std::flush;

    google::protobuf::ShutdownProtobufLibrary();


    return 0;
}

ghost (Author) commented Jul 29, 2016

It even crashes when I use just one thread:
osmpbf::parseFile(inFile, MyCounter(&state));

dbahrdt (Collaborator) commented Jul 29, 2016

That's strange. I'm regularly using osmpbf to parse the planet.osm.pbf. I'll try to get your program running.

dbahrdt (Collaborator) commented Jul 29, 2016

Does the filteredCount example work for you? It works for me:

$ /usr/bin/time -v ./filteredCount -k highway /data/osm/pbfs/europe-160714.osm.pbf
File /data/osm/pbfs/europe-160714.osm.pbf has the following amounts of matching primitives:
Nodes: 4233608
Ways: 45128632
Relations: 15238
Command being timed: "./filteredCount -k highway /data/osm/pbfs/europe-160714.osm.pbf"
User time (seconds): 576.24
System time (seconds): 4.77
Percent of CPU this job got: 185%
Elapsed (wall clock) time (h:mm:ss or m:ss): 5:13.75
Average shared text size (kbytes): 0
Average unshared data size (kbytes): 0
Average stack size (kbytes): 0
Average total size (kbytes): 0
Maximum resident set size (kbytes): 18215036
Average resident set size (kbytes): 0
Major (requiring I/O) page faults: 17428
Minor (reclaiming a frame) page faults: 972171
Voluntary context switches: 58995
Involuntary context switches: 746
Swaps: 0
File system inputs: 35691696
File system outputs: 0
Socket messages sent: 0
Socket messages received: 0
Signals delivered: 0
Page size (bytes): 4096
Exit status: 0
Please note that the resident set size and the shared size were almost always the same. osmpbf itself does not allocate large amounts of memory. The OS will reclaim pages from osmpbf if it needs memory for other processes; file-backed read-only pages should have rather low priority compared to other pages.

ghost (Author) commented Jul 29, 2016

It still crashes at the same place, even on the smaller 4 GB Asia PBF, after about 5 seconds of running. Maybe you fixed it in your latest commit 85718e1, which I currently don't have compiled. I will try tomorrow to see whether it resolves the issue.

dbahrdt (Collaborator) commented Jul 30, 2016

The commit you mention should not fix the issue. Did you try the filteredCount example on the Europe extract?

ghost (Author) commented Jul 31, 2016

I have created a pull request (#6) to fix reading files larger than 3 GB. Once the mman owner also commits the small offset fix (I was inspired by you), we will have complete Windows support: alitrack/mman-win32#6.
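
For reference, the kind of change involved is splitting the 64-bit file offset into the high/low DWORD parameters of MapViewOfFile instead of truncating it to 32 bits. The snippet below is only a hypothetical illustration of that kind of change, not the actual patch in either repository; mapReadOnlyView is a made-up name.

//Hypothetical illustration only (not the actual patch): mapping a view of a
//large file on Windows requires passing the 64-bit offset as two DWORDs.
#include <windows.h>
#include <cstdint>

void * mapReadOnlyView(HANDLE fileMapping, uint64_t offset, size_t length)
{
    DWORD offsetHigh = static_cast<DWORD>(offset >> 32);
    DWORD offsetLow = static_cast<DWORD>(offset & 0xFFFFFFFFull);
    //returns nullptr on failure; offset must be a multiple of the allocation granularity
    return MapViewOfFile(fileMapping, FILE_MAP_READ, offsetHigh, offsetLow, length);
}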

ghost (Author) commented Aug 1, 2016

I don't think I have fixed it properly though, because the Europe extract works fine, but with the planet file it starts to use up all the memory.
For the Europe extract the peak was
200 MB / 1.8 GB (commit / working set)

For the world extract it is
500 MB / 10 GB (commit / working set)

ghost (Author) commented Aug 2, 2016

I think memory-mapped files on Windows can't automatically free up pages for larger files. By the way, do we need to use a memory-mapped file? Can't we just read it sequentially? A lock is used anyway to prevent the threads from reading the file at once, so I don't see the point of mapping the whole file into memory when only one thread can read a blob at a time.
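
To illustrate the sequential alternative: an OSM PBF file is a sequence of (4-byte big-endian length, BlobHeader message, Blob payload) records, so the blobs can be walked with a plain ifstream. This is only a rough sketch, assuming the generated fileformat.pb.h (OSMPBF::BlobHeader) from osmpbf's sources is on the include path; readNextBlob is a made-up helper and error handling is simplified.

//Rough sketch, not production code: read the next blob record sequentially
//instead of mmapping the whole file. The stream must be opened with std::ios::binary.
#include <fstream>
#include <vector>
#include <cstdint>
#include "fileformat.pb.h" //assumed: generated from osmpbf's fileformat.proto

bool readNextBlob(std::ifstream & in, OSMPBF::BlobHeader & header, std::vector<char> & blobData)
{
    unsigned char lenBuf[4];
    if (!in.read(reinterpret_cast<char*>(lenBuf), 4))
        return false;
    //the length prefix of the BlobHeader is big-endian
    uint32_t headerSize = (uint32_t(lenBuf[0]) << 24) | (uint32_t(lenBuf[1]) << 16) |
                          (uint32_t(lenBuf[2]) << 8) | uint32_t(lenBuf[3]);

    std::vector<char> headerData(headerSize);
    if (!in.read(headerData.data(), headerSize) || !header.ParseFromArray(headerData.data(), headerSize))
        return false;

    //the Blob payload follows immediately; its size comes from the BlobHeader
    blobData.resize(header.datasize());
    return static_cast<bool>(in.read(blobData.data(), header.datasize()));
}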

ghost (Author) commented Aug 3, 2016

I ended up using https://github.com/osmcode/libosmium after all. It seems memory-mapped files only work well on UNIX, because the support there is better. For the world extract it was running with an average of 500 MB / 400 MB (commit / working set). I haven't studied their code thoroughly, but they are using memory buffers.
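
For comparison, a minimal counter with libosmium's reader/handler API looks roughly like this (a sketch, not the exact code I ended up with):

//Minimal sketch of counting primitives with libosmium, which reads the file
//in buffered chunks instead of mapping it whole.
#include <cstdint>
#include <iostream>
#include <osmium/io/pbf_input.hpp>
#include <osmium/handler.hpp>
#include <osmium/visitor.hpp>

struct CountHandler : public osmium::handler::Handler
{
    uint64_t nodes = 0, ways = 0, relations = 0;
    void node(const osmium::Node &) noexcept { ++nodes; }
    void way(const osmium::Way &) noexcept { ++ways; }
    void relation(const osmium::Relation &) noexcept { ++relations; }
};

int main(int argc, char ** argv)
{
    if (argc < 2) return -1;
    osmium::io::Reader reader(argv[1]); //decompresses and buffers blobs internally
    CountHandler handler;
    osmium::apply(reader, handler); //iterates over all objects, calling the handler
    reader.close();
    std::cout << "Nodes: " << handler.nodes << " Ways: " << handler.ways
              << " Relations: " << handler.relations << std::endl;
    return 0;
}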
