5f720ea
From 39c86f23d401f9d7329d94fcbf32b51cbc003b8c Mon Sep 17 00:00:00 2001
5f720ea
From: Kovid Goyal <kovid@kovidgoyal.net>
5f720ea
Date: Sat, 20 Jul 2019 12:40:26 +0530
5f720ea
Subject: [PATCH 03/71] Update WSJ
5f720ea
5f720ea
Fixes #1837213 [Private bug](https://bugs.launchpad.net/calibre/+bug/1837213)
5f720ea
---
5f720ea
 recipes/wsj.recipe      | 98 +++++++++++++++++++++++------------------
5f720ea
 recipes/wsj_free.recipe | 98 +++++++++++++++++++++++------------------
5f720ea
 2 files changed, 110 insertions(+), 86 deletions(-)
5f720ea
5f720ea
diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe
5f720ea
index da28f081b3..f40f3fedfe 100644
5f720ea
--- a/recipes/wsj.recipe
5f720ea
+++ b/recipes/wsj.recipe
5f720ea
@@ -5,10 +5,7 @@
5f720ea
 from __future__ import absolute_import, division, print_function, unicode_literals
5f720ea
 
5f720ea
 import json
5f720ea
-try:
5f720ea
-    from urllib.parse import quote
5f720ea
-except ImportError:
5f720ea
-    from urllib import quote
5f720ea
+from base64 import standard_b64encode
5f720ea
 
5f720ea
 from mechanize import Request
5f720ea
 
5f720ea
@@ -16,6 +13,16 @@
5f720ea
 from calibre.web.feeds.news import BasicNewsRecipe
5f720ea
 from css_selectors import Select
5f720ea
 
5f720ea
+try:
5f720ea
+    import urllib.parse as urlparse
5f720ea
+except ImportError:
5f720ea
+    import urlparse
5f720ea
+try:
5f720ea
+    from urllib.parse import quote
5f720ea
+except ImportError:
5f720ea
+    from urllib import quote
5f720ea
+
5f720ea
+
5f720ea
 needs_subscription = True
5f720ea
 
5f720ea
 
5f720ea
@@ -40,7 +47,7 @@ class WSJ(BasicNewsRecipe):
5f720ea
     ignore_duplicate_articles = {'url'}
5f720ea
     remove_attributes = ['style', 'data-scrim']
5f720ea
     needs_subscription = needs_subscription
5f720ea
-    WSJ_ITP = 'https://online.wsj.com/itp/today'
5f720ea
+    WSJ_ITP = 'https://www.wsj.com/print-edition/today'
5f720ea
 
5f720ea
     keep_only_tags = [
5f720ea
         dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
5f720ea
@@ -87,51 +94,56 @@ def get_cover_url(self):
5f720ea
     # login {{{
5f720ea
     if needs_subscription:
5f720ea
         def get_browser(self, *a, **kw):
5f720ea
-            # To understand the signin logic read signin.js from
5f720ea
-            # https://id.wsj.com/access/pages/wsj/us/signin.html
5f720ea
-            # This is the same login servie as used by Barrons
5f720ea
+            # To understand the login logic read app-min.js from
5f720ea
+            # https://sso.accounts.dowjones.com/login
5f720ea
+            itp = quote(self.WSJ_ITP, safe='')
5f720ea
+            start_url = 'https://accounts.wsj.com/login?target=' + itp
5f720ea
             kw['user_agent'] = random_user_agent(allow_ie=False)
5f720ea
             br = BasicNewsRecipe.get_browser(self, *a, **kw)
5f720ea
-            # self.wsj_itp_page = open('/t/raw.html').read()
5f720ea
-            # return br
5f720ea
-            url = 'https://id.wsj.com/access/pages/wsj/us/signin.html?mg=com-wsj&mg=id-wsj'
5f720ea
-            # br.set_debug_http(True)
5f720ea
-            br.open(url).read()
5f720ea
-            rurl = 'https://id.wsj.com/auth/submitlogin.json'
5f720ea
-            rq = Request(rurl, headers={
5f720ea
-                'Accept': 'application/json, text/javascript, */*; q=0.01',
5f720ea
+            self.log('Starting login process...')
5f720ea
+            res = br.open(start_url)
5f720ea
+            sso_url = res.geturl()
5f720ea
+            query =  urlparse.parse_qs(urlparse.urlparse(sso_url).query)
5f720ea
+            query = {k:v[0] for k, v in query.items()}
5f720ea
+            request_query = {
5f720ea
+                'username': self.username,
5f720ea
+                'password': self.password,
5f720ea
+                'client_id': query['client'],
5f720ea
+                'sso': 'true',
5f720ea
+                'tenant': 'sso',
5f720ea
+                '_intstate': 'deprecated',
5f720ea
+            }
5f720ea
+            for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split():
5f720ea
+                request_query[k] = query[k]
5f720ea
+            login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
5f720ea
+            # you can get the version below from lib-min.js
5f720ea
+            # search for: str: "x.x.x"
5f720ea
+            # This might need to be updated in the future
5f720ea
+            auth0_client = json.dumps({"name": "auth0.js", "version": "7.0.3"})
5f720ea
+            if not isinstance(auth0_client, bytes):
5f720ea
+                auth0_client = auth0_client.encode('utf-8')
5f720ea
+            auth0_client = standard_b64encode(auth0_client)
5f720ea
+            if isinstance(auth0_client, bytes):
5f720ea
+                auth0_client = auth0_client.decode('ascii')
5f720ea
+            rq = Request(login_url, headers={
5f720ea
+                'Accept': 'text/html',
5f720ea
                 'Accept-Language': 'en-US,en;q=0.8',
5f720ea
-                'Content-Type': 'application/json',
5f720ea
-                'Referer': url,
5f720ea
+                'Auth0-Client': auth0_client.rstrip('='),
5f720ea
                 'X-HTTP-Method-Override': 'POST',
5f720ea
                 'X-Requested-With': 'XMLHttpRequest',
5f720ea
-            }, data=json.dumps({
5f720ea
-                'username': self.username,
5f720ea
-                'password': self.password,
5f720ea
-                'realm': 'default',
5f720ea
-                'savelogin': 'true',
5f720ea
-                'template': 'default',
5f720ea
-                'url': quote(self.WSJ_ITP),
5f720ea
-            }))
5f720ea
-            r = br.open(rq)
5f720ea
-            if r.code != 200:
5f720ea
-                raise ValueError('Failed to login, check username and password')
5f720ea
-            data = json.loads(r.read())
5f720ea
-            # print(data)
5f720ea
-            if data.get('result') != 'success':
5f720ea
-                raise ValueError(
5f720ea
-                    'Failed to login (XHR failed), check username and password')
5f720ea
-            br.set_cookie('m', data['username'], '.wsj.com')
5f720ea
-            try:
5f720ea
-                r = br.open(data['url'])
5f720ea
-            except Exception:
5f720ea
-                self.log.error('Failed to open login url: {}'.format(data['url']))
5f720ea
-                raise
5f720ea
-            self.wsj_itp_page = raw = r.read()
5f720ea
+                'X-Remote-User': self.username
5f720ea
+            }, data=request_query)
5f720ea
+            self.log('Sending login request...')
5f720ea
+            res = br.open(rq)
5f720ea
+            if res.code != 200:
5f720ea
+                raise ValueError('Failed to login, check your username and password')
5f720ea
+            br.select_form(nr=0)
5f720ea
+            self.log('Performing login callback...')
5f720ea
+            res = br.submit()
5f720ea
+            self.wsj_itp_page = raw = res.read()
5f720ea
             if b'>Sign Out<' not in raw:
5f720ea
                 raise ValueError(
5f720ea
-                    'Failed to login (auth URL failed), check username and password')
5f720ea
-            # open('/t/raw.html', 'w').write(raw)
5f720ea
+                    'Failed to login (callback URL failed), check username and password')
5f720ea
             return br
5f720ea
     else:
5f720ea
         def get_browser(self, *a, **kw):
5f720ea
diff --git a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe
5f720ea
index e04e210114..25726c0ca3 100644
5f720ea
--- a/recipes/wsj_free.recipe
5f720ea
+++ b/recipes/wsj_free.recipe
5f720ea
@@ -5,10 +5,7 @@
5f720ea
 from __future__ import absolute_import, division, print_function, unicode_literals
5f720ea
 
5f720ea
 import json
5f720ea
-try:
5f720ea
-    from urllib.parse import quote
5f720ea
-except ImportError:
5f720ea
-    from urllib import quote
5f720ea
+from base64 import standard_b64encode
5f720ea
 
5f720ea
 from mechanize import Request
5f720ea
 
5f720ea
@@ -16,6 +13,16 @@
5f720ea
 from calibre.web.feeds.news import BasicNewsRecipe
5f720ea
 from css_selectors import Select
5f720ea
 
5f720ea
+try:
5f720ea
+    import urllib.parse as urlparse
5f720ea
+except ImportError:
5f720ea
+    import urlparse
5f720ea
+try:
5f720ea
+    from urllib.parse import quote
5f720ea
+except ImportError:
5f720ea
+    from urllib import quote
5f720ea
+
5f720ea
+
5f720ea
 needs_subscription = False
5f720ea
 
5f720ea
 
5f720ea
@@ -40,7 +47,7 @@ class WSJ(BasicNewsRecipe):
5f720ea
     ignore_duplicate_articles = {'url'}
5f720ea
     remove_attributes = ['style', 'data-scrim']
5f720ea
     needs_subscription = needs_subscription
5f720ea
-    WSJ_ITP = 'https://online.wsj.com/itp/today'
5f720ea
+    WSJ_ITP = 'https://www.wsj.com/print-edition/today'
5f720ea
 
5f720ea
     keep_only_tags = [
5f720ea
         dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
5f720ea
@@ -87,51 +94,56 @@ def get_cover_url(self):
5f720ea
     # login {{{
5f720ea
     if needs_subscription:
5f720ea
         def get_browser(self, *a, **kw):
5f720ea
-            # To understand the signin logic read signin.js from
5f720ea
-            # https://id.wsj.com/access/pages/wsj/us/signin.html
5f720ea
-            # This is the same login servie as used by Barrons
5f720ea
+            # To understand the login logic read app-min.js from
5f720ea
+            # https://sso.accounts.dowjones.com/login
5f720ea
+            itp = quote(self.WSJ_ITP, safe='')
5f720ea
+            start_url = 'https://accounts.wsj.com/login?target=' + itp
5f720ea
             kw['user_agent'] = random_user_agent(allow_ie=False)
5f720ea
             br = BasicNewsRecipe.get_browser(self, *a, **kw)
5f720ea
-            # self.wsj_itp_page = open('/t/raw.html').read()
5f720ea
-            # return br
5f720ea
-            url = 'https://id.wsj.com/access/pages/wsj/us/signin.html?mg=com-wsj&mg=id-wsj'
5f720ea
-            # br.set_debug_http(True)
5f720ea
-            br.open(url).read()
5f720ea
-            rurl = 'https://id.wsj.com/auth/submitlogin.json'
5f720ea
-            rq = Request(rurl, headers={
5f720ea
-                'Accept': 'application/json, text/javascript, */*; q=0.01',
5f720ea
+            self.log('Starting login process...')
5f720ea
+            res = br.open(start_url)
5f720ea
+            sso_url = res.geturl()
5f720ea
+            query =  urlparse.parse_qs(urlparse.urlparse(sso_url).query)
5f720ea
+            query = {k:v[0] for k, v in query.items()}
5f720ea
+            request_query = {
5f720ea
+                'username': self.username,
5f720ea
+                'password': self.password,
5f720ea
+                'client_id': query['client'],
5f720ea
+                'sso': 'true',
5f720ea
+                'tenant': 'sso',
5f720ea
+                '_intstate': 'deprecated',
5f720ea
+            }
5f720ea
+            for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split():
5f720ea
+                request_query[k] = query[k]
5f720ea
+            login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
5f720ea
+            # you can get the version below from lib-min.js
5f720ea
+            # search for: str: "x.x.x"
5f720ea
+            # This might need to be updated in the future
5f720ea
+            auth0_client = json.dumps({"name": "auth0.js", "version": "7.0.3"})
5f720ea
+            if not isinstance(auth0_client, bytes):
5f720ea
+                auth0_client = auth0_client.encode('utf-8')
5f720ea
+            auth0_client = standard_b64encode(auth0_client)
5f720ea
+            if isinstance(auth0_client, bytes):
5f720ea
+                auth0_client = auth0_client.decode('ascii')
5f720ea
+            rq = Request(login_url, headers={
5f720ea
+                'Accept': 'text/html',
5f720ea
                 'Accept-Language': 'en-US,en;q=0.8',
5f720ea
-                'Content-Type': 'application/json',
5f720ea
-                'Referer': url,
5f720ea
+                'Auth0-Client': auth0_client.rstrip('='),
5f720ea
                 'X-HTTP-Method-Override': 'POST',
5f720ea
                 'X-Requested-With': 'XMLHttpRequest',
5f720ea
-            }, data=json.dumps({
5f720ea
-                'username': self.username,
5f720ea
-                'password': self.password,
5f720ea
-                'realm': 'default',
5f720ea
-                'savelogin': 'true',
5f720ea
-                'template': 'default',
5f720ea
-                'url': quote(self.WSJ_ITP),
5f720ea
-            }))
5f720ea
-            r = br.open(rq)
5f720ea
-            if r.code != 200:
5f720ea
-                raise ValueError('Failed to login, check username and password')
5f720ea
-            data = json.loads(r.read())
5f720ea
-            # print(data)
5f720ea
-            if data.get('result') != 'success':
5f720ea
-                raise ValueError(
5f720ea
-                    'Failed to login (XHR failed), check username and password')
5f720ea
-            br.set_cookie('m', data['username'], '.wsj.com')
5f720ea
-            try:
5f720ea
-                r = br.open(data['url'])
5f720ea
-            except Exception:
5f720ea
-                self.log.error('Failed to open login url: {}'.format(data['url']))
5f720ea
-                raise
5f720ea
-            self.wsj_itp_page = raw = r.read()
5f720ea
+                'X-Remote-User': self.username
5f720ea
+            }, data=request_query)
5f720ea
+            self.log('Sending login request...')
5f720ea
+            res = br.open(rq)
5f720ea
+            if res.code != 200:
5f720ea
+                raise ValueError('Failed to login, check your username and password')
5f720ea
+            br.select_form(nr=0)
5f720ea
+            self.log('Performing login callback...')
5f720ea
+            res = br.submit()
5f720ea
+            self.wsj_itp_page = raw = res.read()
5f720ea
             if b'>Sign Out<' not in raw:
5f720ea
                 raise ValueError(
5f720ea
-                    'Failed to login (auth URL failed), check username and password')
5f720ea
-            # open('/t/raw.html', 'w').write(raw)
5f720ea
+                    'Failed to login (callback URL failed), check username and password')
5f720ea
             return br
5f720ea
     else:
5f720ea
         def get_browser(self, *a, **kw):