From 9f1d7955f06cbbe626bd517b4f71bb97dbc6dc23 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 30 Jul 2019 19:31:06 +0530
Subject: [PATCH 27/71] Misc CHM Input fixes

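Mostly to deal with CHM files that have broken filename encoding: when
a path cannot be resolved or decoded with the file's declared encoding,
fall back to UTF-8.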
---
 src/calibre/ebooks/chm/reader.py                   | 13 +++++++++++--
 src/calibre/ebooks/conversion/plugins/chm_input.py |  6 +++---
 src/calibre/utils/chm/chm.py                       |  2 +-
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py
index 983526232a..b5ce8981be 100644
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@@ -97,9 +97,14 @@ class CHMReader(CHMFile):
         return toc
 
     def ResolveObject(self, path):
+        opath = path
         if not isinstance(path, bytes):
             path = path.encode(self.chm_encoding)
-        return CHMFile.ResolveObject(self, path)
+        ans = CHMFile.ResolveObject(self, path)
+        if ans[0] != chmlib.CHM_RESOLVE_SUCCESS and not isinstance(opath, bytes):
+            path = opath.encode('utf-8')
+            ans = CHMFile.ResolveObject(self, path)
+        return ans
 
     def GetFile(self, path):
         # have to have abs paths for ResolveObject, but Contents() deliberately
@@ -280,7 +285,11 @@ class CHMReader(CHMFile):
         paths = []
 
         def get_paths(chm, ui, ctx):
-            path = as_unicode(ui.path, self.chm_encoding)
+            try:
+                path = as_unicode(ui.path, self.chm_encoding)
+            except UnicodeDecodeError:
+                path = as_unicode(ui.path, 'utf-8')
+
             # skip directories
             # note this path refers to the internal CHM structure
             if path[-1] != '/':
diff --git a/src/calibre/ebooks/conversion/plugins/chm_input.py b/src/calibre/ebooks/conversion/plugins/chm_input.py
index b28e31e74e..44f5de5f88 100644
--- a/src/calibre/ebooks/conversion/plugins/chm_input.py
+++ b/src/calibre/ebooks/conversion/plugins/chm_input.py
@@ -10,7 +10,7 @@ import os
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ptempfile import TemporaryDirectory
 from calibre.constants import filesystem_encoding
-from polyglot.builtins import unicode_type
+from polyglot.builtins import unicode_type, as_bytes
 
 
 class CHMInput(InputFormatPlugin):
@@ -170,7 +170,7 @@ class CHMInput(InputFormatPlugin):
                                    pretty_print=True)
                 f.write(raw)
             else:
-                f.write(hhcdata)
+                f.write(as_bytes(hhcdata))
         return htmlpath, toc
 
     def _read_file(self, name):
@@ -180,7 +180,7 @@ class CHMInput(InputFormatPlugin):
 
     def add_node(self, node, toc, ancestor_map):
         from calibre.ebooks.chm.reader import match_string
-        if match_string(node.attrib['type'], 'text/sitemap'):
+        if match_string(node.attrib.get('type', ''), 'text/sitemap'):
             p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
             parent = p[0] if p else None
             toc = ancestor_map.get(parent, toc)
diff --git a/src/calibre/utils/chm/chm.py b/src/calibre/utils/chm/chm.py
index 7be92acaa5..d313ab5620 100644
--- a/src/calibre/utils/chm/chm.py
+++ b/src/calibre/utils/chm/chm.py
@@ -506,7 +506,7 @@ class CHMFile:
         if not self.topics:
             self.topics = self.GetString(text, toc_index)
             if not self.topics.startswith(b"/"):
-                self.topics = "b/" + self.topics
+                self.topics = b"/" + self.topics
 
         if not self.index:
             self.index = self.GetString(text, idx_index)