From 1b01d660b616aeee783ce9169cd9e9a0ac6e5a60 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 12 Aug 2019 10:10:50 +0530
Subject: [PATCH 62/71] Clean up HTML metadata parsing
---
setup/test.py | 2 +
src/calibre/ebooks/metadata/html.py | 167 ++++++++++++----------------
2 files changed, 74 insertions(+), 95 deletions(-)
diff --git a/setup/test.py b/setup/test.py
index 6638dda11d..5d28f4a433 100644
--- a/setup/test.py
+++ b/setup/test.py
@@ -110,6 +110,8 @@ def find_tests(which_tests=None):
if ok('ebooks'):
from calibre.ebooks.metadata.rtf import find_tests
a(find_tests())
+ from calibre.ebooks.metadata.html import find_tests
+ a(find_tests())
if ok('misc'):
from calibre.ebooks.metadata.tag_mapper import find_tests
a(find_tests())
diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py
index c6d4693baf..48408bc3a8 100644
--- a/src/calibre/ebooks/metadata/html.py
+++ b/src/calibre/ebooks/metadata/html.py
@@ -12,14 +12,15 @@ import re
import unittest
from collections import defaultdict
-from HTMLParser import HTMLParser
+from html5_parser import parse
+from lxml.etree import Comment
from calibre.ebooks.metadata import string_to_authors, authors_to_string
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre import replace_entities, isbytestring
from calibre.utils.date import parse_date, is_date_undefined
-from polyglot.builtins import iteritems, itervalues
+from polyglot.builtins import iteritems
def get_metadata(stream):
@@ -56,90 +57,76 @@ META_NAMES = {
'comments': ('comments', 'dc.description'),
'tags': ('tags',),
}
+rmap_comment = {v:k for k, v in iteritems(COMMENT_NAMES)}
+rmap_meta = {v:k for k, l in iteritems(META_NAMES) for v in l}
+
# Extract an HTML attribute value, supports both single and double quotes and
# single quotes inside double quotes and vice versa.
attr_pat = r'''(?:(?P<sq>')|(?P<dq>"))(?P<content>(?(sq)[^']+|[^"]+))(?(sq)'|")'''
+
+def handle_comment(data, comment_tags):
+ if not hasattr(handle_comment, 'pat'):
+ handle_comment.pat = re.compile(r'''(?P<name>\S+)\s*=\s*%s''' % attr_pat)
+ for match in handle_comment.pat.finditer(data):
+ x = match.group('name')
+ field = None
+ try:
+ field = rmap_comment[x]
+ except KeyError:
+ pass
+ if field:
+ comment_tags[field].append(replace_entities(match.group('content')))
+
+
def parse_metadata(src):
- class MetadataParser(HTMLParser):
- def __init__(self):
- self.comment_tags = defaultdict(list)
- self.meta_tag_ids = defaultdict(list)
- self.meta_tags = defaultdict(list)
- self.title_tag = ''
-
- self.recording = False
- self.recorded = []
-
- self.rmap_comment = {v:k for k, v in iteritems(COMMENT_NAMES)}
- self.rmap_meta = {v:k for k, l in iteritems(META_NAMES) for v in l}
-
- HTMLParser.__init__(self)
-
- def handle_starttag(self, tag, attrs):
- attr_dict = dict(attrs)
-
- if tag == 'title':
- self.recording = True
- self.recorded = []
-
- elif tag == 'meta' and re.match(r'(?:dc|dcterms)[.:]identifier(?:\.|$)', attr_dict.get('name', ''), flags=re.IGNORECASE):
- scheme = None
- if re.match(r'(?:dc|dcterms)[.:]identifier$', attr_dict.get('name', ''), flags=re.IGNORECASE):
- scheme = attr_dict.get('scheme', '').strip()
- elif 'scheme' not in attr_dict:
- elements = re.split(r'[.:]', attr_dict['name'])
- if len(elements) == 3:
- scheme = elements[2].strip()
- if scheme:
- self.meta_tag_ids[scheme.lower()].append(attr_dict.get('content', ''))
-
- elif tag == 'meta':
- x = attr_dict.get('name', '').lower()
- field = None
- try:
- field = self.rmap_meta[x]
- except KeyError:
- try:
- field = self.rmap_meta[x.replace(':', '.')]
- except KeyError:
- pass
- if field:
- self.meta_tags[field].append(attr_dict.get('content', ''))
-
- def handle_data(self, data):
- if self.recording:
- self.recorded.append(data)
-
- def handle_charref(self, ref):
- if self.recording:
- self.recorded.append(replace_entities("&#%s;" % ref))
-
- def handle_entityref(self, ref):
- if self.recording:
- self.recorded.append(replace_entities("&%s;" % ref))
-
- def handle_endtag(self, tag):
- if tag == 'title':
- self.recording = False
- self.title_tag = ''.join(self.recorded)
-
- def handle_comment(self, data):
- for match in re.finditer(r'''(?P<name>\S+)\s*=\s*%s''' % (attr_pat), data):
- x = match.group('name')
- field = None
+ root = parse(src)
+ comment_tags = defaultdict(list)
+ meta_tags = defaultdict(list)
+ meta_tag_ids = defaultdict(list)
+ title = ''
+ identifier_pat = re.compile(r'(?:dc|dcterms)[.:]identifier(?:\.|$)', flags=re.IGNORECASE)
+ id_pat2 = re.compile(r'(?:dc|dcterms)[.:]identifier$', flags=re.IGNORECASE)
+
+ for comment in root.iterdescendants(tag=Comment):
+ if comment.text:
+ handle_comment(comment.text, comment_tags)
+
+ for q in root.iterdescendants(tag='title'):
+ if q.text:
+ title = q.text
+ break
+
+ for meta in root.iterdescendants(tag='meta'):
+ name, content = meta.get('name'), meta.get('content')
+ if not name or not content:
+ continue
+ if identifier_pat.match(name) is not None:
+ scheme = None
+ if id_pat2.match(name) is not None:
+ scheme = meta.get('scheme')
+ else:
+ elements = re.split(r'[.:]', name)
+ if len(elements) == 3 and not meta.get('scheme'):
+ scheme = elements[2].strip()
+ if scheme:
+ meta_tag_ids[scheme.lower()].append(content)
+ else:
+ x = name.lower()
+ field = None
+ try:
+ field = rmap_meta[x]
+ except KeyError:
try:
- field = self.rmap_comment[x]
+ field = rmap_meta[x.replace(':', '.')]
except KeyError:
pass
- if field:
- self.comment_tags[field].append(replace_entities(match.group('content')))
+ if field:
+ meta_tags[field].append(content)
- parser = MetadataParser()
- parser.feed(src)
+ return comment_tags, meta_tags, meta_tag_ids, title
- return (parser.comment_tags, parser.meta_tags, parser.meta_tag_ids, parser.title_tag)
def get_metadata_(src, encoding=None):
# Meta data definitions as in
@@ -151,7 +138,7 @@ def get_metadata_(src, encoding=None):
else:
src = src.decode(encoding, 'replace')
src = src[:150000] # Searching shouldn't take too long
- (comment_tags, meta_tags, meta_tag_ids, title_tag) = parse_metadata(src)
+ comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)
def get_all(field):
ans = comment_tags.get(field, meta_tags.get(field, None))
@@ -257,7 +244,10 @@ def get_metadata_(src, encoding=None):
class MetadataHtmlTest(unittest.TestCase):
def compare_metadata(self, meta_a, meta_b):
- for attr in ('title', 'authors', 'publisher', 'isbn', 'languages', 'pubdate', 'timestamp', 'series', 'series_index', 'rating', 'comments', 'tags', 'identifiers'):
+ for attr in (
+ 'title', 'authors', 'publisher', 'isbn', 'languages', 'pubdate', 'timestamp', 'series',
+ 'series_index', 'rating', 'comments', 'tags', 'identifiers'
+ ):
self.assertEqual(getattr(meta_a, attr), getattr(meta_b, attr))
def get_stream(self, test):
@@ -324,7 +314,7 @@ class MetadataHtmlTest(unittest.TestCase):
<!-- SERIES="Comment Series" -->
<!-- SERIESNUMBER="3" -->
<!-- RATING="20" -->
- <!-- COMMENTS="comment "comments" ♥ HTML too &amp;" -->
+ <!-- COMMENTS="comment "comments" ♥ HTML -- too &amp;" -->
<!-- TAGS="tag d" -->
'''
@@ -340,7 +330,7 @@ class MetadataHtmlTest(unittest.TestCase):
<!-- SERIES="Comment Series 2" -->
<!-- SERIESNUMBER="4" -->
<!-- RATING="1" -->
- <!-- COMMENTS="comment "comments" ♥ HTML too &amp; for sure" -->
+ <!-- COMMENTS="comment "comments" ♥ HTML -- too &amp; for sure" -->
<!-- TAGS="tag e, tag f" -->
'''
@@ -352,13 +342,11 @@ class MetadataHtmlTest(unittest.TestCase):
'''
return BytesIO(raw)
-
def test_input_title(self):
stream_meta = get_metadata(self.get_stream('title'))
canon_meta = Metadata('A Title Tag & Title Ⓒ', [_('Unknown')])
self.compare_metadata(stream_meta, canon_meta)
-
def test_input_meta_single(self):
stream_meta = get_metadata(self.get_stream('meta_single'))
canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington'])
@@ -374,7 +362,6 @@ class MetadataHtmlTest(unittest.TestCase):
canon_meta.set_identifiers({'isbn': '1234567890'})
self.compare_metadata(stream_meta, canon_meta)
-
def test_input_meta_multi(self):
stream_meta = get_metadata(self.get_stream('meta_multi'))
canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington', 'John Adams', 'Thomas Jefferson'])
@@ -390,7 +377,6 @@ class MetadataHtmlTest(unittest.TestCase):
canon_meta.set_identifiers({'isbn': '1234567890', 'url': 'http://google.com/search?q=calibre'})
self.compare_metadata(stream_meta, canon_meta)
-
def test_input_comment_single(self):
stream_meta = get_metadata(self.get_stream('comment_single'))
canon_meta = Metadata('A Comment Tag & Title Ⓒ', ['James Madison', 'James Monroe'])
@@ -401,12 +387,11 @@ class MetadataHtmlTest(unittest.TestCase):
canon_meta.series = 'Comment Series'
canon_meta.series_index = float(3)
canon_meta.rating = float(0)
- canon_meta.comments = 'comment "comments" ♥ HTML too &amp;'
+ canon_meta.comments = 'comment "comments" ♥ HTML -- too &amp;'
canon_meta.tags = ['tag d']
canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
self.compare_metadata(stream_meta, canon_meta)
-
def test_input_comment_multi(self):
stream_meta = get_metadata(self.get_stream('comment_multi'))
canon_meta = Metadata('A Comment Tag & Title Ⓒ', ['James Madison', 'James Monroe', 'John Quincy Adams'])
@@ -417,19 +402,11 @@ class MetadataHtmlTest(unittest.TestCase):
canon_meta.series = 'Comment Series'
canon_meta.series_index = float(3)
canon_meta.rating = float(0)
- canon_meta.comments = 'comment "comments" ♥ HTML too &amp;'
+ canon_meta.comments = 'comment "comments" ♥ HTML -- too &amp;'
canon_meta.tags = ['tag d', 'tag e', 'tag f']
canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
self.compare_metadata(stream_meta, canon_meta)
-def suite():
+def find_tests():
return unittest.TestLoader().loadTestsFromTestCase(MetadataHtmlTest)
-
-
-def test():
- unittest.TextTestRunner(verbosity=2).run(suite())
-
-
-if __name__ == '__main__':
- test()