Skip to content

Commit

Permalink
fix: add proper parsing for missing note attributes
Browse files Browse the repository at this point in the history
  • Loading branch information
vzhd1701 committed Dec 13, 2021
1 parent 267e565 commit 562ddc6
Show file tree
Hide file tree
Showing 10 changed files with 116 additions and 69 deletions.
84 changes: 23 additions & 61 deletions enex2notion/enex_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,64 +5,15 @@
import re
import uuid
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List
from xml.etree import ElementTree

from dateutil.parser import isoparse

logger = logging.getLogger(__name__)

from enex2notion.enex_types import EvernoteNote, EvernoteResource

@dataclass(frozen=True)
class EvernoteResource(object):
data_bin: bytes
size: int
md5: str
mime: str
file_name: str


@dataclass
class EvernoteNote(object):
title: str
created: datetime
updated: datetime
content: str # noqa: WPS110
tags: List[str]
author: str
url: str
is_webclip: bool
resources: List[EvernoteResource]
_note_hash: str = None

def resource_by_md5(self, md5):
for resource in self.resources:
if resource.md5 == md5:
return resource
return None

@property
def note_hash(self):
if self._note_hash is None:
hashable = [
self.title,
self.created.isoformat(),
self.updated.isoformat(),
self.content,
"".join(self.tags),
self.author,
self.url,
]

s1_hash = hashlib.sha1()
for h in hashable:
s1_hash.update(h.encode("utf-8"))
self._note_hash = s1_hash.hexdigest() # noqa: WPS601

return self._note_hash
logger = logging.getLogger(__name__)


def iter_notes(enex_file: Path):
Expand Down Expand Up @@ -108,30 +59,39 @@ def _etree_to_dict(t): # noqa: WPS210, WPS231, C901


def _process_note(note_raw: dict):
if not note_raw:
note_raw = {}

note_attrs = note_raw.get("note-attributes") or {}

note_tags = note_raw.get("tag", [])
if isinstance(note_tags, str):
note_tags = [note_tags]

note_resources = note_raw.get("resource", [])
if isinstance(note_resources, dict):
note_resources = [note_resources]
resources = [_convert_resource(r) for r in note_resources]
now = datetime.now()

return EvernoteNote(
title=note_raw["title"],
created=isoparse(note_raw["created"]),
updated=isoparse(note_raw["updated"]),
content=note_raw["content"],
title=note_raw.get("title", "Untitled"),
created=isoparse(note_raw.get("created", now.isoformat())),
updated=isoparse(note_raw.get("updated", now.isoformat())),
content=note_raw.get("content", ""),
tags=note_tags,
author=note_attrs.get("author", ""),
url=note_attrs.get("source-url", ""),
is_webclip=_is_webclip(note_raw),
resources=resources,
resources=_parse_resources(note_raw),
)


def _parse_resources(note_raw):
note_resources = note_raw.get("resource", [])

if isinstance(note_resources, dict):
note_resources = [note_resources]

return [_convert_resource(r) for r in note_resources]


def _is_webclip(note_raw: dict):
note_attrs = note_raw.get("note-attributes") or {}

Expand All @@ -141,7 +101,9 @@ def _is_webclip(note_raw: dict):
return True

return bool(
re.match('<div[^>]+style="[^"]+en-clipped-content[^"]*"', note_raw["content"])
re.match(
'<div[^>]+style="[^"]+en-clipped-content[^"]*"', note_raw.get("content", "")
)
)


Expand Down
53 changes: 53 additions & 0 deletions enex2notion/enex_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import hashlib
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional


@dataclass(frozen=True)
class EvernoteResource(object):
    """Immutable record describing one resource attached to a note."""

    # Binary content of the resource.
    data_bin: bytes
    # Size of the resource (presumably in bytes -- confirm against the parser).
    size: int
    # MD5 string; this is the key `EvernoteNote.resource_by_md5` matches on.
    md5: str
    # MIME type string.
    mime: str
    # Name of the attached file.
    file_name: str


@dataclass
class EvernoteNote(object):
    """A single parsed note together with its resources.

    ``_note_hash`` is a regular dataclass field: it can be passed to the
    constructor and it takes part in ``==`` comparisons, so two otherwise
    equal notes differ until ``note_hash`` has been read on both.
    """

    title: str
    created: datetime
    updated: datetime
    content: str  # noqa: WPS110
    tags: List[str]
    author: str
    url: str
    is_webclip: bool
    resources: List[EvernoteResource]
    # Cache for `note_hash`; stays None until the property is first read.
    _note_hash: Optional[str] = None

    def resource_by_md5(self, md5):
        """Return the first resource whose ``md5`` equals *md5*, or ``None``."""
        return next((res for res in self.resources if res.md5 == md5), None)

    @property
    def note_hash(self):
        """Return the SHA-1 hex digest identifying this note.

        The digest covers, in order: title, ISO-formatted ``created`` and
        ``updated``, content, all tags concatenated, author and url, each
        encoded as UTF-8.  It is computed on first access and cached in
        ``_note_hash``; later changes to the fields do not refresh it.
        """
        if self._note_hash is None:
            # Field order and the separator-less tag join define the hash
            # value, so they must stay exactly as they are.
            hashed_fields = (
                self.title,
                self.created.isoformat(),
                self.updated.isoformat(),
                self.content,
                "".join(self.tags),
                self.author,
                self.url,
            )

            digest = hashlib.sha1()
            for field_value in hashed_fields:
                digest.update(field_value.encode("utf-8"))
            self._note_hash = digest.hexdigest()  # noqa: WPS601

        return self._note_hash
2 changes: 1 addition & 1 deletion enex2notion/enex_uploader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from notion.block import CollectionViewPageBlock, PageBlock
from progress.bar import Bar

from enex2notion.enex_parser import EvernoteNote
from enex2notion.enex_types import EvernoteNote
from enex2notion.note_parser_dom import logger
from enex2notion.note_uploader import upload_block

Expand Down
2 changes: 1 addition & 1 deletion enex2notion/note_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from bs4 import BeautifulSoup

from enex2notion.enex_parser import EvernoteNote
from enex2notion.enex_types import EvernoteNote
from enex2notion.note_parser_dom import logger, parse_note_dom
from enex2notion.notion_blocks import TextProp
from enex2notion.notion_blocks_container import NotionCalloutBlock
Expand Down
2 changes: 1 addition & 1 deletion enex2notion/note_parser_e_media.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from bs4 import Tag
from w3lib.url import parse_data_uri

from enex2notion.enex_parser import EvernoteResource
from enex2notion.enex_types import EvernoteResource
from enex2notion.notion_blocks_embeddable import NotionImageEmbedBlock
from enex2notion.notion_blocks_uploadable import (
NotionAudioBlock,
Expand Down
2 changes: 1 addition & 1 deletion enex2notion/note_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from notion.block import FileBlock
from notion.settings import S3_URL_PREFIX

from enex2notion.enex_parser import EvernoteResource
from enex2notion.enex_types import EvernoteResource
from enex2notion.notion_blocks_uploadable import NotionUploadableBlock


Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from notion.block import PageBlock
from notion.client import NotionClient

from enex2notion.enex_parser import EvernoteResource
from enex2notion.enex_types import EvernoteResource


@pytest.fixture()
Expand Down
34 changes: 33 additions & 1 deletion tests/test_enex_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import pytest
from dateutil.tz import tzutc

from enex2notion.enex_parser import EvernoteNote, EvernoteResource, iter_notes
from enex2notion.enex_parser import iter_notes
from enex2notion.enex_types import EvernoteNote, EvernoteResource


def test_iter_notes_single(fs):
Expand Down Expand Up @@ -486,3 +487,34 @@ def test_iter_notes_single_with_empty_resource(fs, caplog):
notes[0].resource_by_md5("d41d8cd98f00b204e9800998ecf8427e")
== expected_resource
)


def test_iter_notes_single_empty(fs, mocker):
    """An empty ``<note>`` element yields a note filled with default values."""
    expected_hash = "479b590db4f4d8817f93d01af51f21c894815920"

    fs.create_file(
        "test.enex",
        contents="""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
<en-export export-date="20211218T085932Z" application="Evernote" version="10.25.6">
<note>
</note>
</en-export>
""",
    )

    # Freeze the parser's clock so the default timestamps are predictable.
    frozen_now = datetime.datetime(2021, 11, 18, 8, 0, 0, tzinfo=tzutc())
    datetime_mock = mocker.patch("enex2notion.enex_parser.datetime")
    datetime_mock.now.return_value = frozen_now

    parsed_notes = list(iter_notes(Path("test.enex")))
    parsed_note = parsed_notes[0]

    expected_note = EvernoteNote(
        title="Untitled",
        created=frozen_now,
        updated=frozen_now,
        content="",
        tags=[],
        author="",
        url="",
        is_webclip=False,
        resources=[],
        _note_hash=expected_hash,
    )

    # Read note_hash first: the property fills the cached _note_hash field,
    # which the dataclass equality below compares as well.
    assert parsed_note.note_hash == expected_hash
    assert parsed_note == expected_note
2 changes: 1 addition & 1 deletion tests/test_enex_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from dateutil.tz import tzutc
from notion.block import CalloutBlock, CollectionViewPageBlock, FileBlock, PageBlock

from enex2notion.enex_parser import EvernoteNote
from enex2notion.enex_types import EvernoteNote
from enex2notion.enex_uploader import get_import_root, upload_note
from enex2notion.enex_uploader_modes import get_notebook_database, get_notebook_page
from enex2notion.note_parser import parse_note
Expand Down
2 changes: 1 addition & 1 deletion tests/test_note_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from bs4 import BeautifulSoup
from dateutil.tz import tzutc

from enex2notion.enex_parser import EvernoteNote, EvernoteResource
from enex2notion.enex_types import EvernoteNote, EvernoteResource
from enex2notion.note_parser import parse_note
from enex2notion.note_parser_dom import parse_note_dom
from enex2notion.notion_blocks import (
Expand Down

0 comments on commit 562ddc6

Please sign in to comment.