Python script to automatically extract excerpts from articles

#!/usr/bin/env python2
"""
This is a simple script to automatically extract excerpts from articles. It
requires BeautifulSoup 4 (bs4).

Usage:

from excerpt_extractor import get_summary
url = "http://someurl.com/goes/here"
(title, description) = get_summary(url)

==========================================

Some examples, discussion, and comparison with the Facebook article extractor
are at http://blog.davidziegler.net/post/122176962/a-python-script-to-automatically-extract-excerpts-from

copyright:
Copyright 2009 by David Ziegler <david.ziegler@gmail.com>,
Copyright 2015 Xavier <somenxavier@gmail.com>

license: MIT License

website: http://github.com/dziegler/excerpt_extractor/tree/master
"""
from bs4 import BeautifulSoup, SoupStrainer, Comment
import urllib2
import cookielib
import re

def cleanSoup(soup):
    # get rid of javascript, noscript and css
    for elem in ('script', 'noscript', 'style'):
        for tree in soup(elem):
            tree.extract()
    # get rid of doctype
    for tree in soup.findAll(text=re.compile("DOCTYPE")):
        tree.extract()
    # get rid of comments
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    return soup

def removeHeaders(soup):
    # get rid of headers, which rarely make good excerpt text
    for elem in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        for tree in soup(elem):
            tree.extract()
    return soup

def get_summary(url):
    # fetch the page with cookie handling enabled
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    doc = opener.open(url).read()

    # parse only the <head> to pick up the title and meta description
    soup = cleanSoup(BeautifulSoup(doc, parse_only=SoupStrainer('head')))
    if not soup:
        raise ValueError("Invalid output: %s" % url)

    try:
        title = soup.head.title.string
    except AttributeError:
        title = None

    description = ''
    for meta in soup.findAll('meta'):
        if meta.get('name', '').lower() == 'description':
            description = meta['content']
            break

    # no meta description: fall back to the longest line of body text,
    # truncated to 255 characters
    if not description:
        soup = removeHeaders(cleanSoup(BeautifulSoup(doc, parse_only=SoupStrainer('body'))))
        text = ''.join(soup.findAll(text=True)).split('\n')
        description = max((len(i.strip()), i) for i in text)[1].strip()[0:255]

    return (title, description)

if __name__ == "__main__":
    urllist = ("http://www.crummy.com/software/BeautifulSoup/bs4/doc/",
               "http://www.google.com",
               "http://www.somenxavier.xyz",
               "http://www.reason.com/news/show/134059.html")
    for u in urllist:
        print get_summary(u)[1] + '\n'
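
Since the script targets Python 2 (urllib2, cookielib, the print statement), here is a minimal sketch of the same head-parsing flow under Python 3. This is an illustration, not part of the repository: urllib2 and cookielib become urllib.request and http.cookiejar, the parser is named explicitly to avoid bs4's "no parser specified" warning, and the longest-line body fallback is omitted for brevity.

#!/usr/bin/env python3
# Sketch only: a Python 3 rendering of get_summary's head-parsing path.
import http.cookiejar
import urllib.request

from bs4 import BeautifulSoup, SoupStrainer


def get_summary(url):
    # cookie-aware fetch, mirroring the original urllib2 opener
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    doc = opener.open(url).read()
    # parse only the <head>, as the original does for title and meta description
    soup = BeautifulSoup(doc, "html.parser", parse_only=SoupStrainer("head"))
    title = soup.title.string if soup.title else None
    description = ""
    for meta in soup.find_all("meta"):
        if meta.get("name", "").lower() == "description":
            description = meta.get("content", "")
            break
    return (title, description)


if __name__ == "__main__":
    print(get_summary("http://www.crummy.com/software/BeautifulSoup/bs4/doc/"))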