Skip to content

Commit

Permalink
Merge pull request #1027 from icecraft/refactor/move_defs
Browse files Browse the repository at this point in the history
refactor: move some constants or enums defs to config folder
  • Loading branch information
myhloli authored Nov 19, 2024
2 parents bc99243 + b492c19 commit 6c8f563
Show file tree
Hide file tree
Showing 43 changed files with 2,488 additions and 1,622 deletions.
53 changes: 53 additions & 0 deletions magic_pdf/config/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""span维度自定义字段."""
# Custom span-level field: marks a span that was merged across pages.
CROSS_PAGE = 'cross_page'

# ---------------------------------------------------------------------------
# Custom block-level fields
# ---------------------------------------------------------------------------
# Marks a block whose lines have been deleted.
LINES_DELETED = 'lines_deleted'

# ---------------------------------------------------------------------------
# Table recognition
# ---------------------------------------------------------------------------
# Default value for the maximum table-recognition time
# (the unit is not stated in this module).
TABLE_MAX_TIME_VALUE = 400
# Maximum length of the pp table result ("pp_table_result_max_length").
TABLE_MAX_LEN = 480
# File name of the table-master structure dictionary.
TABLE_MASTER_DICT = 'table_master_structure_dict.txt'
# Directory of the table-master model (note the trailing slash).
TABLE_MASTER_DIR = 'table_structure_tablemaster_infer/'

# ---------------------------------------------------------------------------
# pp OCR detection / recognition models
# ---------------------------------------------------------------------------
# Directory name of the pp detection model.
DETECT_MODEL_DIR = 'ch_PP-OCRv4_det_infer'
# Directory name of the pp recognition model.
REC_MODEL_DIR = 'ch_PP-OCRv4_rec_infer'
# File name of the character dictionary used by the pp recognition model.
REC_CHAR_DICT = 'ppocr_keys_v1.txt'
# Directory the recognition model is copied to
# (presumably relative to the user's home directory -- confirm with callers).
PP_REC_DIRECTORY = '.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'
# Directory the detection model is copied to (same caveat as above).
PP_DET_DIRECTORY = '.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'


class MODEL_NAME:
    """String identifiers for the models referenced by the configuration.

    This is a plain namespace class (not an ``Enum``); every attribute is a
    ``str`` and is meant to be read as ``MODEL_NAME.<ATTR>``.
    """

    # pp table structure algorithm
    TABLE_MASTER = 'tablemaster'
    # struct eqtable
    STRUCT_EQTABLE = 'struct_eqtable'

    DocLayout_YOLO = 'doclayout_yolo'

    LAYOUTLMv3 = 'layoutlmv3'

    YOLO_V8_MFD = 'yolo_v8_mfd'

    # NOTE(review): the attribute name mentions "v2" but the value does not;
    # kept as-is because both are part of the public interface.
    UniMerNet_v2_Small = 'unimernet_small'

    RAPID_TABLE = 'rapid_table'
35 changes: 35 additions & 0 deletions magic_pdf/config/drop_reason.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
class DropReason:
    """String codes describing why a PDF (or part of it) was dropped.

    This is a plain namespace class (not an ``Enum``); every attribute is a
    ``str``. Several attribute names contain historical misspellings
    (``BLCOK``, ``lOAD``); they are kept unchanged because callers reference
    them by name.
    """

    # Text blocks overlap each other horizontally, so the reading order of
    # the text cannot be determined accurately.
    TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap'
    # Blocks that must be kept overlap horizontally.
    USEFUL_BLOCK_HOR_OVERLAP = 'useful_block_horizontal_overlap'
    # Complicated layout; not supported for now.
    COMPLICATED_LAYOUT = 'complicated_layout'
    # Layouts with more than 2 columns are currently not supported.
    TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns'
    # The PDF contains colored blocks, which change the reading order; PDFs
    # with text blocks on a colored background are currently not supported.
    COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box'
    # Contains special images; too expensive to compute, hence dropped.
    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = 'high_computational_load_by_imgs'
    # Special SVG graphics; too expensive to compute, hence dropped.
    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = 'high_computational_load_by_svgs'
    # Computation exceeds the load limit; the current method is too costly.
    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = (
        'high_computational_load_by_total_pages'
    )
    # Layout analysis failed. (The value contains a space; it is a runtime
    # string and is intentionally left unchanged.)
    MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result'
    # An exception occurred during parsing. The attribute name shadows the
    # builtin ``Exception`` inside this class body only; it is kept for
    # backward compatibility.
    Exception = '_exception'
    # The PDF is encrypted.
    ENCRYPTED = 'encrypted'
    # The total page count of the PDF is 0.
    EMPTY_PDF = 'total_page=0'
    # Not a text-based PDF; it cannot be parsed directly.
    NOT_IS_TEXT_PDF = 'not_is_text_pdf'
    # Paragraphs cannot be separated clearly.
    DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block'
    # Title detection failed.
    TITLE_DETECTION_FAILED = 'title_detection_failed'
    # Title-level analysis failed (e.g. level 1 / level 2 / level 3 titles).
    TITLE_LEVEL_FAILED = 'title_level_failed'
    # Paragraph recognition failed.
    PARA_SPLIT_FAILED = 'para_split_failed'
    # Paragraph merging failed.
    PARA_MERGE_FAILED = 'para_merge_failed'
    # Unsupported language.
    NOT_ALLOW_LANGUAGE = 'not_allow_language'
    SPECIAL_PDF = 'special_pdf'
    # Text columns cannot be determined precisely.
    PSEUDO_SINGLE_COLUMN = 'pseudo_single_column'
    # The layout of the page cannot be analyzed.
    CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout'
    # Scaling produced a bbox with negative area.
    NEGATIVE_BBOX_AREA = 'negative_bbox_area'
    # Overlapping blocks cannot be separated.
    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = 'overlap_blocks_can_t_separation'
19 changes: 19 additions & 0 deletions magic_pdf/config/drop_tag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

# Module-level drop tags (plain strings, hyphen-separated except the first).
COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
# Page number.
PAGE_NO = 'page-no'
# Text located inside the header/footer area.
CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area'
# Vertical text.
VERTICAL_TEXT = 'vertical-text'
# Rotated text.
ROTATE_TEXT = 'rotate-text'
# Blank block on the page edge that has no content at all.
EMPTY_SIDE_BLOCK = 'empty-side-block'
# Text lying on top of an image.
ON_IMAGE_TEXT = 'on-image-text'
# Text lying on top of a table.
ON_TABLE_TEXT = 'on-table-text'


class DropTag:
    """String tags grouped as a plain namespace class (not an ``Enum``).

    Presumably each tag labels the category of content that was dropped
    (page number, header, footer, ...) -- confirm against the callers.
    """

    PAGE_NUMBER = 'page_no'
    HEADER = 'header'
    FOOTER = 'footer'
    FOOTNOTE = 'footnote'
    NOT_IN_LAYOUT = 'not_in_layout'
    SPAN_OVERLAP = 'span_overlap'
    BLOCK_OVERLAP = 'block_overlap'
11 changes: 11 additions & 0 deletions magic_pdf/config/make_content_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
class MakeMode:
    """String identifiers for the available content-making modes.

    Plain namespace class (not an ``Enum``); every attribute is a ``str``.
    The exact output produced by each mode is defined by the code that
    consumes these values, not here.
    """

    MM_MD = 'mm_markdown'
    NLP_MD = 'nlp_markdown'
    STANDARD_FORMAT = 'standard_format'


class DropMode:
    """String identifiers for the available drop modes.

    Plain namespace class (not an ``Enum``); every attribute is a ``str``.
    Presumably the value selects how much is discarded when a drop reason is
    hit (whole PDF, single page, nothing) -- confirm against the callers.
    """

    WHOLE_PDF = 'whole_pdf'
    SINGLE_PAGE = 'single_page'
    NONE = 'none'
    NONE_WITH_REASON = 'none_with_reason'
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from enum import Enum


class ModelBlockTypeEnum(Enum):
    """Integer codes for the block types used by the model output.

    The numeric values are not contiguous; they are kept exactly as defined
    so that ``ModelBlockTypeEnum(<int>)`` lookups keep working.
    """

    TITLE = 0
    PLAIN_TEXT = 1
    ABANDON = 2
    ISOLATE_FORMULA = 8
    EMBEDDING = 13
    # Defined exactly once: reusing a member name inside an Enum body raises
    # TypeError when the class is created.
    ISOLATED = 14
File renamed without changes.
Loading

0 comments on commit 6c8f563

Please sign in to comment.