From 05c2f2e67a120a1cb79e59e5037b62797fe00a3f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 18 Aug 2019 17:52:49 +0530
Subject: [PATCH 70/71] Get parse_index() working for foreign affairs AJAX
backend
---
recipes/foreignaffairs.recipe | 176 +++++++++++++++++++++++++---------
1 file changed, 133 insertions(+), 43 deletions(-)
diff --git a/recipes/foreignaffairs.recipe b/recipes/foreignaffairs.recipe
index a3f5436d61..60f095db5c 100644
--- a/recipes/foreignaffairs.recipe
+++ b/recipes/foreignaffairs.recipe
@@ -1,12 +1,12 @@
#!/usr/bin/env python2
-from calibre.web.feeds.news import BasicNewsRecipe
+import json
import re
+
import html5lib
+import mechanize
from lxml import html
-
-def select_form(form):
- return form.attrs.get('id', None) == 'user-login'
+from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
@@ -15,6 +15,123 @@ def classes(classes):
'class': lambda x: x and frozenset(x.split()).intersection(q)})
+def as_article(source, log):
+    '''Convert the ``_source`` dict of a search hit into an article dict.'''
+    url, title = source['url'], source['title']
+    desc = source.get('field_subtitle') or ''
+    authors = source.get('field_display_authors')
+    if authors:
+        # No subtitle: start with "by ..." rather than a stray leading space
+        desc = (desc + ' by ' + authors) if desc else ('by ' + authors)
+    log(title, url)
+    return {'url': url, 'title': title, 'description': desc}
+
+
+def get_issue_data(br, log, node_id='1124670'):
+    '''Query the site's JSON search endpoint with browser ``br`` and return
+    the issue ``node_id`` as a list of (section title, articles) tuples: the
+    issue's main section first, then the remaining articles grouped by
+    ``fa_node_type_or_subtype`` in sorted order. ``log`` is passed on to
+    as_article() for every article found.'''
+    headers = {
+        'Accept': 'application/json, text/plain, */*',
+        'Content-Type': 'application/json;charset=UTF-8',
+        'Origin': 'https://www.foreignaffairs.com',
+        'Referer': 'https://www.foreignaffairs.com',
+    }
+    data = {
+        "_source": {
+            "includes": [
+                "normalized_date", "field_issue_volume_number",
+                "field_issue_volume", "url", "fa_path", "title",
+                "fa_node_issue_cover_url", "nid",
+                "field_issue_ssection_header",
+                "field_issue_ssection_articles:nid"
+            ]
+        },
+        "query": {
+            "match": {
+                "id": {
+                    "query": node_id
+                }
+            }
+        },
+        "size": 1
+    }
+
+    def get_data(query):
+        # POST the query as JSON and return the list of search hits
+        search_url = 'https://www.foreignaffairs.com/node/_search'
+        req = mechanize.Request(url=search_url,
+                                data=json.dumps(query),
+                                headers=headers,
+                                method='POST')
+        res = br.open(req)
+        return json.loads(res.read())['hits']['hits']
+
+    issue_data = get_data(data)
+    source = issue_data[0]['_source']
+    nids = source['field_issue_ssection_articles:nid']
+    section_title = source['field_issue_ssection_header']
+
+    data = {
+        '_source': {
+            'includes': [
+                'field_tags:name', 'field_topics:name', 'field_regions:name',
+                'url', 'title', 'field_subtitle', 'field_display_authors',
+                'nid', 'fa_node_has_audio', 'fa_node_paywall_free',
+                'field_capsule_review_category:name',
+                'fa_node_type_or_subtype', 'type'
+            ]
+        },
+        'query': {
+            'terms': {
+                'id': nids
+            }
+        },
+        'size': 30
+    }
+
+    sections_data = get_data(data)
+    log('Found main section:', section_title)
+    main_articles = [as_article(hit['_source'], log) for hit in sections_data]
+
+    # Remaining articles of this issue: filter on node_id, not a fixed issue
+    data['size'] = 100
+    data['query'] = {
+        'bool': {
+            'must': [{
+                'terms': {
+                    'fa_node_type_or_subtype': [
+                        'Comment', 'Essay', 'Interview', 'Review Essay',
+                        'Letter From', 'Letter', 'Response', 'Capsule Review'
+                    ]
+                }
+            }, {
+                'term': {
+                    'field_issue:nid': {
+                        'term': node_id
+                    }
+                }
+            }],
+            'must_not': [{
+                'terms': {
+                    'id': nids
+                }
+            }]
+        }
+    }
+
+    feed = {}
+    for hit in get_data(data):
+        article = hit['_source']
+        feed.setdefault(article['fa_node_type_or_subtype'], []).append(
+            as_article(article, log))
+    ans = [(sec, feed[sec]) for sec in sorted(feed)]
+
+    return [(section_title, main_articles)] + ans
+
+
class ForeignAffairsRecipe(BasicNewsRecipe):
''' there are three modifications:
@@ -55,43 +172,18 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
'publisher': publisher}
def parse_index(self):
- answer = []
soup = self.index_to_soup(self.FRONTPAGE)
- div = soup.find(
- 'div', attrs={'class': 'magazine-actions'})
- self.cover_url = div.find('img')['ng-src']
# get dates
date = re.split(r'\s\|\s', self.tag_to_string(
soup.head.title.string))[0]
self.title = "Foreign Affairs ({})".format(date)
self.timefmt = u' [%s]' % date
-
- # Fetching article list does not work as site uses javascript
- # to load articles dynamically
- for section in soup.findAll('section', attrs={'class':lambda x: x and 'magazine-list' in x.split()}):
- articles = []
- section_title = self.tag_to_string(section.find('h2'))
- if 'special_section.title' in section_title:
- section_title = 'Special'
- self.log('\nSection:', section_title)
- for h3 in section.findAll(attrs={'class': lambda x: x and 'magazine-title' in x.split()}):
- a = h3.findParent('a', href=True)
- title = self.tag_to_string(h3)
- url = a['href']
- atr = a.findNextSibling(attrs={'class':'author'})
- author = self.tag_to_string(atr) if atr else ''
- desc = a.findNextSibling(attrs={'class': 'deck'})
- if desc is not None:
- description = self.tag_to_string(desc)
- else:
- description = ''
- articles.append({'title': title, 'url': url,
- 'description': description, 'author': author})
- self.log(title)
- self.log('\t' + url)
- if articles:
- answer.append((section_title, articles))
- return answer
+ cls = soup.find('body')['class']  # <body> class attribute; searched below for page-node-<digits>
+ if isinstance(cls, (list, tuple)):  # may be a sequence of class names instead of one string
+ cls = ' '.join(cls)
+ node_id = re.search(r'\bpage-node-(\d+)\b', cls).group(1)  # issue node id; AttributeError if no such class
+ br = self.cloned_browser
+ return get_issue_data(br, self.log, node_id)  # sections come from get_issue_data(), not from this page
def clean_fa_html(self, root):
for svg in tuple(root.iter('{*}svg')):
@@ -104,7 +196,7 @@ def preprocess_raw_html(self, raw_html, url):
root = html5lib.parse(raw_html, treebuilder='lxml',
namespaceHTMLElements=False).getroot()
self.clean_fa_html(root)
- return html.tostring(root)
+ return html.tostring(root, encoding='unicode')
def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'ng-src': True}):
@@ -112,16 +204,14 @@ def preprocess_html(self, soup):
return soup
def get_browser(self):
+
+ def select_form(form):
+ return form.attrs.get('id', None) == 'user-login'
+
br = BasicNewsRecipe.get_browser(self)
if self.username is not None and self.password is not None:
- # mechanize fails to parse the html correctly, so use html5lib to
- # sanitize the html first
- response = br.open(
+ br.open(
'https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
- root = html5lib.parse(
- response.get_data(), treebuilder='lxml', namespaceHTMLElements=False)
- response.set_data(html.tostring(root))
- br.set_response(response)
br.select_form(predicate=select_form)
br.form['name'] = self.username
br.form['pass'] = self.password