Blame 0065-Updated-The-Globe-and-Mail-recipe-article-titles-no-.patch

5f720ea
From af8d024e4380ad66f4bbf3d96e21d42c2304e9e5 Mon Sep 17 00:00:00 2001
5f720ea
From: josue <josue.salazar@sapientrazorfish.com>
5f720ea
Date: Tue, 13 Aug 2019 21:08:05 -0400
5f720ea
Subject: [PATCH 65/71] Updated The Globe and Mail recipe: article titles no
5f720ea
 longer include the section the article belongs to, added article metadata to
5f720ea
 remove_tags because it was taking up almost 1 full page of dummy text, added
5f720ea
 real estate section, updated newspaper name to its proper one
5f720ea
5f720ea
---
5f720ea
 recipes/globe_and_mail.recipe | 9 +++++----
5f720ea
 1 file changed, 5 insertions(+), 4 deletions(-)
5f720ea
5f720ea
diff --git a/recipes/globe_and_mail.recipe b/recipes/globe_and_mail.recipe
5f720ea
index 35e8d75daf..58f4f3e384 100644
5f720ea
--- a/recipes/globe_and_mail.recipe
5f720ea
+++ b/recipes/globe_and_mail.recipe
5f720ea
@@ -18,7 +18,7 @@ def classes(classes):
5f720ea
 
5f720ea
 
5f720ea
 class GlobeMail(BasicNewsRecipe):
5f720ea
-    title = u'Globe & Mail'
5f720ea
+    title = u'The Globe and Mail'
5f720ea
     __author__ = 'Kovid Goyal'
5f720ea
     encoding = 'utf-8'
5f720ea
     publisher = 'Globe & Mail'
5f720ea
@@ -32,12 +32,12 @@ class GlobeMail(BasicNewsRecipe):
5f720ea
             dict(name='main', attrs={'class': lambda x: x and 'article-primary-content-chain' in x.split()}),
5f720ea
     ]
5f720ea
     remove_tags = [
5f720ea
-            classes('c-ad pb-f-commercial-dfp-ads pb-f-article-actions'),
5f720ea
+            classes('c-ad pb-f-commercial-dfp-ads pb-f-article-actions pb-f-article-meta'),
5f720ea
     ]
5f720ea
 
5f720ea
     def parse_index(self):
5f720ea
         ans = []
5f720ea
-        for section in 'canada opinion politics sports life arts world'.split():
5f720ea
+        for section in 'canada opinion politics sports life arts world real-estate'.split():
5f720ea
             if self.test and len(ans) >= self.test[0]:
5f720ea
                 break
5f720ea
             soup = self.index_to_soup('https://www.theglobeandmail.com/{}/'.format(section))
5f720ea
@@ -49,7 +49,8 @@ def parse_index(self):
5f720ea
 
5f720ea
     def parse_gm_section(self, soup):
5f720ea
         for a in soup.findAll('a', href=True, attrs={'data-lt-lid': lambda x: x and x.startswith('Headline.')}):
5f720ea
-            title = self.tag_to_string(a)
5f720ea
+            headline = a.find('div', 'c-card__hed-text')
5f720ea
+            title = self.tag_to_string(headline)
5f720ea
             url = absolutize(a['href'])
5f720ea
             self.log('  ', title, 'at', url)
5f720ea
             yield {'title': title, 'url': url}