|
|
5f720ea |
From af8d024e4380ad66f4bbf3d96e21d42c2304e9e5 Mon Sep 17 00:00:00 2001
|
|
|
5f720ea |
From: josue <josue.salazar@sapientrazorfish.com>
|
|
|
5f720ea |
Date: Tue, 13 Aug 2019 21:08:05 -0400
|
|
|
5f720ea |
Subject: [PATCH 65/71] Updated The Globe and Mail recipe: article titles no
|
|
|
5f720ea |
longer include the section the article belongs to, added article metadata to
|
|
|
5f720ea |
remove_tags because it was taking up almost 1 full page of dummy text, added
|
|
|
5f720ea |
real estate section, updated newspaper name to its proper one
|
|
|
5f720ea |
|
|
|
5f720ea |
---
|
|
|
5f720ea |
recipes/globe_and_mail.recipe | 9 +++++----
|
|
|
5f720ea |
1 file changed, 5 insertions(+), 4 deletions(-)
|
|
|
5f720ea |
|
|
|
5f720ea |
diff --git a/recipes/globe_and_mail.recipe b/recipes/globe_and_mail.recipe
|
|
|
5f720ea |
index 35e8d75daf..58f4f3e384 100644
|
|
|
5f720ea |
--- a/recipes/globe_and_mail.recipe
|
|
|
5f720ea |
+++ b/recipes/globe_and_mail.recipe
|
|
|
5f720ea |
@@ -18,7 +18,7 @@ def classes(classes):
|
|
|
5f720ea |
|
|
|
5f720ea |
|
|
|
5f720ea |
class GlobeMail(BasicNewsRecipe):
|
|
|
5f720ea |
- title = u'Globe & Mail'
|
|
|
5f720ea |
+ title = u'The Globe and Mail'
|
|
|
5f720ea |
__author__ = 'Kovid Goyal'
|
|
|
5f720ea |
encoding = 'utf-8'
|
|
|
5f720ea |
publisher = 'Globe & Mail'
|
|
|
5f720ea |
@@ -32,12 +32,12 @@ class GlobeMail(BasicNewsRecipe):
|
|
|
5f720ea |
dict(name='main', attrs={'class': lambda x: x and 'article-primary-content-chain' in x.split()}),
|
|
|
5f720ea |
]
|
|
|
5f720ea |
remove_tags = [
|
|
|
5f720ea |
- classes('c-ad pb-f-commercial-dfp-ads pb-f-article-actions'),
|
|
|
5f720ea |
+ classes('c-ad pb-f-commercial-dfp-ads pb-f-article-actions pb-f-article-meta'),
|
|
|
5f720ea |
]
|
|
|
5f720ea |
|
|
|
5f720ea |
def parse_index(self):
|
|
|
5f720ea |
ans = []
|
|
|
5f720ea |
- for section in 'canada opinion politics sports life arts world'.split():
|
|
|
5f720ea |
+ for section in 'canada opinion politics sports life arts world real-estate'.split():
|
|
|
5f720ea |
if self.test and len(ans) >= self.test[0]:
|
|
|
5f720ea |
break
|
|
|
5f720ea |
soup = self.index_to_soup('https://www.theglobeandmail.com/{}/'.format(section))
|
|
|
5f720ea |
@@ -49,7 +49,8 @@ def parse_index(self):
|
|
|
5f720ea |
|
|
|
5f720ea |
def parse_gm_section(self, soup):
|
|
|
5f720ea |
for a in soup.findAll('a', href=True, attrs={'data-lt-lid': lambda x: x and x.startswith('Headline.')}):
|
|
|
5f720ea |
- title = self.tag_to_string(a)
|
|
|
5f720ea |
+ headline = a.find('div', 'c-card__hed-text')
|
|
|
5f720ea |
+ title = self.tag_to_string(headline)
|
|
|
5f720ea |
url = absolutize(a['href'])
|
|
|
5f720ea |
self.log(' ', title, 'at', url)
|
|
|
5f720ea |
yield {'title': title, 'url': url}
|