Skip to content

Commit

Permalink
Fixes #4
Browse files Browse the repository at this point in the history
  • Loading branch information
vedadian committed Jul 16, 2023
1 parent b44c341 commit 41f7630
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 38 deletions.
1 change: 1 addition & 0 deletions libsrc/dla.h
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,7 @@ struct stuDocTextBlock : public stuDocBlock {

// Document block representing a figure; it tags itself with
// enuDocBlockType::Figure through the stuDocBlock base constructor.
struct stuDocFigureBlock : public stuDocBlock {
// Block associated with this figure as its caption.
// NOTE(review): nothing in the visible code assigns it; presumably it stays
// empty when the figure has no caption -- confirm against the callers.
clsDocBlockPtr Caption;
// Text blocks found inside the figure. getPageBlocks() fills this by running
// findTextBlocks() over the lines whose bounding boxes are contained in the
// figure's bounding box; it is left empty when there are no such lines.
DocBlockPtrVector_t TextBlocks;
// Default-constructs an empty figure block of type Figure.
stuDocFigureBlock() : stuDocBlock(enuDocBlockType::Figure) {}
};

Expand Down
51 changes: 32 additions & 19 deletions libsrc/pdfla.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -372,12 +372,16 @@ clsPdfLaInternals::findPageLinesAndFigures(
if (Item->BoundingBox.area() <=
MAX_IMAGE_BLOB_AREA_FACTOR * _pageSize.area()) {
DocItemPtr_t SameFigure{nullptr};
for (auto &ResultFigureItem : ResultFigures)
if (ResultFigureItem->BoundingBox.hasIntersectionWith(
Item->BoundingBox)) {
for (auto &ResultFigureItem : ResultFigures) {
auto Intersection =
ResultFigureItem->BoundingBox.intersectWith(Item->BoundingBox);
// TODO: Replace the hard-coded -10.f factor used below with a named constant
if (Intersection.width() > -10.f * MIN_ITEM_SIZE &&
Intersection.height() > -10.f * MIN_ITEM_SIZE) {
SameFigure = ResultFigureItem;
break;
}
}
if (SameFigure.get() != nullptr)
SameFigure->BoundingBox.unionWith_(Item->BoundingBox);
else
Expand Down Expand Up @@ -626,6 +630,8 @@ DocBlockPtrVector_t clsPdfLaInternals::findTextBlocks(
DocBlockPtrVector_t Result;
do {
auto Line = getFirstUnusedLine();
if(Line.get() == nullptr)
break;
if (UsedLines.find(Line) != UsedLines.end()) {
std::cout << "THIS MUST NEVER HAPPEN" << std::endl;
}
Expand All @@ -645,7 +651,8 @@ DocBlockPtrVector_t clsPdfLaInternals::findTextBlocks(
for (const auto &OtherLine :
sortedByBoundingBoxes<enuBoundingBoxOrdering::L2R>(
filter(SortedLines, [&Line](const DocLinePtr_t &_otherLine) {
return _otherLine->BoundingBox.verticalOverlap(
return _otherLine.get() != Line.get() &&
_otherLine->BoundingBox.verticalOverlap(
Line->BoundingBox) > MIN_ITEM_SIZE;
}))) {
if (UsedLines.find(OtherLine) == UsedLines.end()) {
Expand Down Expand Up @@ -829,11 +836,7 @@ DocBlockPtrVector_t clsPdfLaInternals::findTextBlocks(
return b->BoundingBox.hasIntersectionWith(Result[i]->BoundingBox);
});
if (OverlappingBlocks.empty()) continue;
clsPdfLaDebug::instance()
.createImage(this)
.add(OverlappingBlocks)
.add(Result[i])
.show("OVB");

// Gather all lines in all these overlapping blocks
DocLinePtrVector_t AllLines = Result[i].asText()->Lines;
for (auto &Block : OverlappingBlocks)
Expand Down Expand Up @@ -903,14 +906,6 @@ DocBlockPtrVector_t clsPdfLaInternals::findTextBlocks(
BestBlock.asText()->Lines.push_back(Line);
}
// Add new blocks to results
clsPdfLaDebug::instance()
.createImage(this)
.add(filter(NewBlocks,
[](const clsDocBlockPtr &b) {
return !b.asText()->Lines.empty();
}))
.show("OVB2");

for (auto &Block : NewBlocks) {
if (Block.asText()->Lines.empty()) continue;
Result.emplace_back(std::move(Block));
Expand Down Expand Up @@ -981,14 +976,32 @@ DocBlockPtrVector_t clsPdfLaInternals::getPageBlocks(size_t _pageIndex) {
auto [Lines, Figures] = std::move(
this->findPageLinesAndFigures(SortedChars, SortedFigures, WhitespaceCover,
MeanCharWidth, MeanCharHeight, PageSize));

auto [FreeLines, LinesInFigures] =
split(Lines, [&, Figures = Figures](const DocLinePtr_t &l) {
return !any(Figures, [&](const DocItemPtr_t &f) {
return f->BoundingBox.contains(l->BoundingBox);
});
});

auto Blocks =
this->findTextBlocks(stuBoundingBox(stuPoint(0.f, 0.f), PageSize), Lines,
Figures, WhitespaceCover);
this->findTextBlocks(stuBoundingBox(stuPoint(0.f, 0.f), PageSize),
FreeLines, Figures, WhitespaceCover);

for (const auto Item : Figures) {
clsDocBlockPtr FigureBlock;
FigureBlock.reset(new stuDocFigureBlock);
FigureBlock->BoundingBox = Item->BoundingBox;
auto FigureLines = filter(LinesInFigures, [&](const DocLinePtr_t &l) {
return Item->BoundingBox.contains(l->BoundingBox);
});
if (FigureLines.size() > 0) {
FigureBlock.asFigure()->TextBlocks = findTextBlocks(
Item->BoundingBox, FigureLines, DocItemPtrVector_t(),
filter(WhitespaceCover, [&](const BoundingBoxPtr_t &w) {
return w->hasIntersectionWith(Item->BoundingBox);
}));
}
Blocks.push_back(FigureBlock);
}

Expand Down
50 changes: 31 additions & 19 deletions tests/blackboxTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,17 @@ cv::Rect bbox2CvRect(const stuBoundingBox &_bbox) {
}

void processPdfFile(const std::string &_pdfFilePath, const std::string &_stem,
const std::string &_debugOut, const std::vector<size_t> _pageIndexes, bool _enableDebugging = false) {
const std::string &_debugOut,
const std::vector<size_t> _pageIndexes,
bool _enableDebugging = false) {
auto PdfFileContent = readFileContents(_pdfFilePath.data());
auto PdfLa =
std::make_shared<clsPdfLa>(PdfFileContent.data(), PdfFileContent.size());

if(_enableDebugging)
PdfLa->enableDebugging(fs::path(_pdfFilePath).stem());
if (_enableDebugging) PdfLa->enableDebugging(fs::path(_pdfFilePath).stem());

std::vector<size_t> PageIndexes = _pageIndexes;
if(PageIndexes.size() == 0) {
if (PageIndexes.size() == 0) {
for (size_t PageIndex = 0; PageIndex < PdfLa->pageCount(); ++PageIndex)
PageIndexes.push_back(PageIndex);
}
Expand All @@ -62,25 +63,35 @@ void processPdfFile(const std::string &_pdfFilePath, const std::string &_stem,

auto PageMatrixData = PdfLa->renderPageImage(PageIndex, 0xffffffff, Size);
cv::Mat PageImage(Size.Height, Size.Width, CV_8UC3, PageMatrixData.data());
for (auto &Block : Blocks) {
auto Item = Block->BoundingBox;
cv::Rect R = bbox2CvRect(Item.scale(Scale));

if(R.area() < 4)
continue;
auto markBoundingBox = [&](const stuBoundingBox &_boundingBox,
const cv::Scalar &_fillColor) {
cv::Rect R = bbox2CvRect(_boundingBox.scale(Scale));
if (R.area() < 4) return;

auto ROI = PageImage(R);
auto RoiCopy = ROI.clone();

cv::rectangle(RoiCopy, cv::Rect(0, 0, R.width, R.height), _fillColor, -1);
cv::rectangle(RoiCopy, cv::Rect(0, 0, R.width, R.height),
cv::Scalar(200, 200, 0), 3);
cv::addWeighted(ROI, 0.3, RoiCopy, 0.7, 0, ROI);
};

for (auto &Block : Blocks) {
cv::Scalar FillColor;
if (Block->Type == enuDocBlockType::Text)
FillColor = cv::Scalar(100, 0, 0);
else
FillColor = cv::Scalar(0, 100, 0);

auto ROI = PageImage(R);
auto RoiCopy = ROI.clone();
markBoundingBox(Block->BoundingBox, FillColor);

cv::rectangle(RoiCopy, cv::Rect(0, 0, R.width, R.height), FillColor, -1);
cv::rectangle(RoiCopy, cv::Rect(0, 0, R.width, R.height), cv::Scalar(200, 200, 0), 3);
cv::addWeighted(ROI, 0.3, RoiCopy, 0.7, 0, ROI);
if (Block->Type == enuDocBlockType::Figure) {
cv::Scalar FillColor(0, 0, 100);
for (const auto &Block : Block.asFigure()->TextBlocks) {
markBoundingBox(Block->BoundingBox, FillColor);
}
}
}

std::ostringstream ss;
Expand All @@ -97,8 +108,8 @@ int main(void) {
const std::string DebugOutputPath =
"/data/Work/Targoman/InternalProjects/TarjomyarV2/PDFA/debug";

const std::vector<std::tuple<std::string, std::vector<size_t>>> ChosenPdfs {
{"bi-1097.pdf", { 5 } },
const std::vector<std::tuple<std::string, std::vector<size_t>>> ChosenPdfs{
{"bi-1097.pdf", {2}},
};
std::vector<std::tuple<fs::path, std::vector<size_t>>> PdfFilePaths;
if (ChosenPdfs.size()) {
Expand All @@ -119,8 +130,9 @@ int main(void) {

std::cout << "Searching `" << BasePath << "` ..." << std::endl;
for (auto &[Path, Pages] : PdfFilePaths) {
std::cout << Path.native() << std::endl;
processPdfFile(Path, Path.stem(), DebugOutputPath, Pages, ChosenPdfs.size() > 0);
std::cout << Path.native() << std::endl;
processPdfFile(Path, Path.stem(), DebugOutputPath, Pages,
ChosenPdfs.size() > 0);
}
return 0;
}

0 comments on commit 41f7630

Please sign in to comment.