Support HD image container for some Mobi books

choryuidentify · Dec 19, 2019 · 3b8d615 · 3b8d615
1 parent b71ed38
commit 3b8d615
Show file tree

Hide file tree

Showing 2 changed files with 249 additions and 0 deletions.
diff --git a/dedrm_src/k4mobidedrm.py b/dedrm_src/k4mobidedrm.py
@@ -86,12 +86,14 @@ class DrmException(Exception):
     from calibre_plugins.dedrm import kgenpids
     from calibre_plugins.dedrm import androidkindlekey
     from calibre_plugins.dedrm import kfxdedrm
+    from calibre_plugins.dedrm import mobimergehdimage
 else:
     import mobidedrm
     import topazextract
     import kgenpids
     import androidkindlekey
     import kfxdedrm
+    import mobimergehdimage
 
 # Wrap a stream so that output gets flushed immediately
 # and also make sure that any unicode strings get
@@ -242,6 +244,27 @@ def GetDecryptedBook(infile, kDatabases, androidFiles, serials, pids, starttime
         raise
 
     print u"Decryption succeeded after {0:.1f} seconds".format(time.time()-starttime)
+
+    if mobi:
+        import glob
+
+        bookparentpath = os.path.dirname(os.path.abspath(infile))
+        azwresfiles = glob.glob(bookparentpath + "/*.azw.res")
+        if azwresfiles != []:
+            if len(azwresfiles) == 1:
+                print u"HDImage Container file is found: %s" % os.path.basename(azwresfiles[0])
+                print u"HDImage merge start..."
+                try:
+                    hdimage_merger = mobimergehdimage.MobiMergeHDImage(mb.mobi_data)
+                    hdimage_merger.load_azwres(azwresfiles[0])
+                    mb.mobi_data = hdimage_merger.merge()
+                    print u"HDImage merge succeeded after {0:.1f} seconds".format(time.time()-starttime)
+                except:
+                    print u"Merge failed. Skipping..."
+            else:
+                for azwres in azwresfiles:
+                    print u"HDImage Containor is exist more then 1. Skipping..."
+
     return mb
 
 

diff --git a/dedrm_src/mobimergehdimage.py b/dedrm_src/mobimergehdimage.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import struct, locale, imghdr, os
+
+#################################################################################
+#                                                                               #
+#                             MOBI HDImage Merger                               #
+#                                                                               #
+#                       2019 Choryu Park (Kyoungkyu Park)                       #
+#                                                                               #
+#################################################################################
+
+# This source code is based on these documents, and source codes;
+#   https://wiki.mobileread.com/wiki/MOBI
+#   https://wiki.mobileread.com/wiki/PDB
+#   https://www.mobileread.com/forums/showpost.php?p=3114050&postcount=1145
+#
+# Thanks to author of 'DumpAZW6'. It many helps to write this source code.
+
+#################################################################################
+# * Important; In this source code, First record is 0. not 1.
+#   So, Last record is Record count - 1.
+#
+# AZW.RES (HDContainer) Structure ->
+# HDContainer is including HD images. each images have own record, and it's Starts with 'CRES'.
+# Images is following main book's image ordering, but not include thumbnail.
+# and if image not need to HD resolution (like logo, special character), that record marked as 'Placeholder' for matching image count.
+#
+# -------------------------------
+# | Book        | HDContainer   | Starts with |
+# |-------------|-------------- |
+# | Image 1 (SD)| HDImage 1     | <- b'CRES'
+# | Image 2     | Placeholder   | <- b'\xA0\xA0\xA0\xA0'
+# | Image 3 (SD)| HDImage 3     | <- b'CRES'
+# | Image 4 (SD)| HDImage 4     | <- b'CRES'
+# | Thumbnail   | *Not exist    |
+# -------------------------------
+# * This structure is sample. Originally, there is no index number.
+
+## Header offsets for Book file
+# 2-byte unsigned short
+BHDR_OFFSET_NUM_OF_RECORD = 76
+
+# 4-byte unsigned long
+BHDR_OFFSET_FILE_IDENT = 60
+BHDR_RECORD0_OFFSET_TEXT_ENCODING = 28
+BHDR_RECORD0_OFFSET_FULL_NAME_OFFSET = 84
+BHDR_RECORD0_OFFSET_FULL_NAME_LENGTH = 88
+BHDR_RECORD0_OFFSET_FIRST_IMAGE_INDEX = 108
+
+# 8-byte Record info structure
+#  - [record Data Offset] 4-byte unsinged long
+#  - [unused] 4-byte
+BHDR_OFFSET_RECORD_INFO_LIST = 78
+
+## Header offsets for HDContainer file
+CHDR_OFFSET_FILE_IDENT = 60
+CHDR_RECORD0_OFFSET_TEXT_ENCODING = 12
+CHDR_RECORD0_OFFSET_FULL_NAME_OFFSET = 40
+CHDR_RECORD0_OFFSET_FULL_NAME_LENGTH = 44
+
+TEXT_ENCODING_MAP = {
+    65001                   : "UTF-8",
+    1252                    : "windows-1252"
+}
+
+CONTAINER_CONTENT_TYPE_IMAGE = "IMAGE"
+CONTAINER_CONTENT_TYPE_PLACEHOLDER = "PLACEHOLDER"
+
+CONTAINER_NEEDED_TYPES = {
+    b"CRES"                 : CONTAINER_CONTENT_TYPE_IMAGE,
+    b"\xa0\xa0\xa0\xa0"     : CONTAINER_CONTENT_TYPE_PLACEHOLDER
+}
+
+# Actually, this function is grabbed from 'DumpAZW6'. That script can getting from URL where in header of this code.
+def get_image_type(imgdata):
+    imgtype = imghdr.what(None, imgdata)
+
+    # horrible hack since imghdr detects jxr/wdp as tiffs
+    if imgtype is not None and imgtype == "tiff":
+        imgtype = "wdp"
+
+    # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some
+    # with only the magic JPEG bytes out there...
+    # ImageMagick handles those, so, do it too.
+    if imgtype is None:
+        if imgdata[0:2] == b'\xFF\xD8':
+            # Get last non-null bytes
+            last = len(imgdata)
+            while (imgdata[last-1:last] == b'\x00'):
+                last-=1
+            # Be extra safe, check the trailing bytes, too.
+            if imgdata[last-2:last] == b'\xFF\xD9':
+                imgtype = "jpeg"
+    return imgtype
+
+def get_charset(data, offset):
+    charset_id, = struct.unpack_from(">L", data, offset)
+
+    if charset_id in TEXT_ENCODING_MAP:
+        return TEXT_ENCODING_MAP[charset_id]
+    else:
+        print u"Unknown Charset %d" % charset_id
+        raise
+
+def get_book_title(data, base_offset, name_offset, length_offset, charset):
+    name_offset, = struct.unpack_from(">L", data, base_offset + name_offset)
+    name_length, = struct.unpack_from(">L", data, base_offset + length_offset)
+    name_offset += base_offset
+
+    return data[name_offset:name_offset + name_length].decode(charset)
+
+class MobiMergeHDImage:
+    hdimage_dict = None
+
+    def __init__(self, mobi_data):
+        self.mobi = mobi_data
+        self.record_dict = self.get_record_dict(self.mobi)
+        if self.mobi[BHDR_OFFSET_FILE_IDENT:BHDR_OFFSET_FILE_IDENT+8] != b'BOOKMOBI':
+            print u"This eBook is not a Mobi."
+            raise
+        self.charset = get_charset(self.mobi, self.record_dict[0]["OFFSET"] + BHDR_RECORD0_OFFSET_TEXT_ENCODING)
+        self.book_title = get_book_title(self.mobi, self.record_dict[0]["OFFSET"], BHDR_RECORD0_OFFSET_FULL_NAME_OFFSET, BHDR_RECORD0_OFFSET_FULL_NAME_LENGTH, self.charset)
+
+    def get_record_dict(self, data):
+        current_offset = BHDR_OFFSET_NUM_OF_RECORD
+        record_count, = struct.unpack_from(">H", data, current_offset)
+        record_dict = dict()
+
+        record_dict["COUNT"] = record_count
+
+        for index in range(0, record_count):
+            record_dict[index] = dict()
+
+            offset = BHDR_OFFSET_RECORD_INFO_LIST + index * 8
+
+            record_dict[index]["INFO_OFFSET"] = offset
+            record_dict[index]["OFFSET"], = struct.unpack_from(">L", data, offset)
+
+        return record_dict
+
+    def load_azwres(self, res_file):
+        with open(res_file, 'rb') as azwresfile:
+            azwres = azwresfile.read()
+        if azwres[CHDR_OFFSET_FILE_IDENT:CHDR_OFFSET_FILE_IDENT + 8] != b'RBINCONT':
+            print u"%s is not a HDImage container." % os.path.basename(res_file)
+            raise
+
+        azwres_dict = self.get_record_dict(azwres)
+
+        azwres_title = get_book_title(azwres, azwres_dict[0]["OFFSET"], CHDR_RECORD0_OFFSET_FULL_NAME_OFFSET, CHDR_RECORD0_OFFSET_FULL_NAME_LENGTH, self.charset)
+
+        if self.book_title != azwres_title:
+            print u"Book mismatch. Book title and HDImage container title is not same. "
+            print u"Book: %s, Container: %s" % (self.book_title, azwres_title)
+            raise
+
+        self.hdimage_dict = dict()
+
+        image_index = 0
+        for index, val in sorted(azwres_dict.items()):
+            if index != "COUNT" and azwres[val["OFFSET"]:val["OFFSET"] + 2] != b"\xe9\x8e":
+                if azwres[val["OFFSET"]:val["OFFSET"] + 4] in CONTAINER_NEEDED_TYPES:
+                    self.hdimage_dict[image_index] = dict()
+                    self.hdimage_dict[image_index]["INDEX"] = index
+                    self.hdimage_dict[image_index]["TYPE"] = CONTAINER_NEEDED_TYPES[azwres[val["OFFSET"]:val["OFFSET"] + 4]]
+                    if self.hdimage_dict[image_index]["TYPE"] == CONTAINER_CONTENT_TYPE_IMAGE:
+                        self.hdimage_dict[image_index]["CONTENT"] = azwres[val["OFFSET"] + 12:azwres_dict[index + 1]["OFFSET"]]
+
+                    image_index += 1
+
+    def record_offset_update(self, record_index, modified_offset_size):
+        for target_record_index in range(record_index + 1, self.record_dict["COUNT"]):
+            self.record_dict[target_record_index]["OFFSET"] = self.record_dict[target_record_index]["OFFSET"] + modified_offset_size
+            self.mobi = "".join(
+                (
+                    self.mobi[:self.record_dict[target_record_index]["INFO_OFFSET"]],
+                    struct.pack(">L", self.record_dict[target_record_index]["OFFSET"]),
+                    self.mobi[self.record_dict[target_record_index]["INFO_OFFSET"] + 4:]
+                )
+            )
+
+    def merge(self):
+        if self.hdimage_dict is None:
+            print u"'azw.res' file is not loaded yet."
+            raise
+
+        first_record_offset = self.record_dict[0]['OFFSET']
+
+        # Find original book's image
+        first_image_index, = struct.unpack_from(">L", self.mobi, first_record_offset + 108)
+
+        images_dict = dict()
+        image_index = 0
+        for record_index in range(first_image_index, self.record_dict["COUNT"]):
+            if record_index + 1 == self.record_dict["COUNT"]:
+                img = self.mobi[self.record_dict[record_index]["OFFSET"]:]
+            else:    
+                img = self.mobi[self.record_dict[record_index]["OFFSET"]:self.record_dict[record_index + 1]["OFFSET"]]
+
+            if get_image_type(img) is not None:
+                images_dict[image_index] = dict()
+                images_dict[image_index]["INDEX"] = record_index
+                images_dict[image_index]["CONTENT"] = img
+                image_index += 1
+
+        # Merge
+        for index, hdimage in self.hdimage_dict.items():
+            if hdimage["TYPE"] == CONTAINER_CONTENT_TYPE_IMAGE:
+                # If HDImage type is IMAGE, do merge.
+                image_offset = self.record_dict[images_dict[index]["INDEX"]]["OFFSET"]
+
+                if self.mobi.find(images_dict[index]["CONTENT"]) != -1:
+                    size_difference = len(hdimage["CONTENT"]) - len(images_dict[index]["CONTENT"])
+
+                    self.mobi = self.mobi.replace(images_dict[index]["CONTENT"], hdimage["CONTENT"])
+
+                    self.record_offset_update(images_dict[index]["INDEX"], size_difference)
+
+            else:
+                # Skip if it's not a image (Maybe PLACEHOLDER)
+                pass
+
+        return self.mobi
+