Blob Blame History Raw
From af8d024e4380ad66f4bbf3d96e21d42c2304e9e5 Mon Sep 17 00:00:00 2001
From: josue <josue.salazar@sapientrazorfish.com>
Date: Tue, 13 Aug 2019 21:08:05 -0400
Subject: [PATCH 65/71] Updated The Globe and Mail recipe: article titles no
 longer include the section the article belongs to, added article metadata to
 remove_tags because it was taking up almost one full page of dummy text, added
 the real estate section, updated the newspaper name to its proper one

---
 recipes/globe_and_mail.recipe | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/recipes/globe_and_mail.recipe b/recipes/globe_and_mail.recipe
index 35e8d75daf..58f4f3e384 100644
--- a/recipes/globe_and_mail.recipe
+++ b/recipes/globe_and_mail.recipe
@@ -18,7 +18,7 @@ def classes(classes):
 
 
 class GlobeMail(BasicNewsRecipe):
-    title = u'Globe & Mail'
+    title = u'The Globe and Mail'
     __author__ = 'Kovid Goyal'
     encoding = 'utf-8'
     publisher = 'Globe & Mail'
@@ -32,12 +32,12 @@ class GlobeMail(BasicNewsRecipe):
             dict(name='main', attrs={'class': lambda x: x and 'article-primary-content-chain' in x.split()}),
     ]
     remove_tags = [
-            classes('c-ad pb-f-commercial-dfp-ads pb-f-article-actions'),
+            classes('c-ad pb-f-commercial-dfp-ads pb-f-article-actions pb-f-article-meta'),
     ]
 
     def parse_index(self):
         ans = []
-        for section in 'canada opinion politics sports life arts world'.split():
+        for section in 'canada opinion politics sports life arts world real-estate'.split():
             if self.test and len(ans) >= self.test[0]:
                 break
             soup = self.index_to_soup('https://www.theglobeandmail.com/{}/'.format(section))
@@ -49,7 +49,8 @@ def parse_index(self):
 
     def parse_gm_section(self, soup):
         for a in soup.findAll('a', href=True, attrs={'data-lt-lid': lambda x: x and x.startswith('Headline.')}):
-            title = self.tag_to_string(a)
+            headline = a.find('div', 'c-card__hed-text')
+            title = self.tag_to_string(headline)
             url = absolutize(a['href'])
             self.log('  ', title, 'at', url)
             yield {'title': title, 'url': url}