Python script to automatically extract excerpts from articles

"""
This is a simple script to automatically extract excerpts from articles. It
requires BeautifulSoup 3.

Usage:

    from excerpt_extractor import get_summary
    url = "http://someurl.com/goes/here"
    (title, description) = get_summary(url)

==========================================

Some examples, discussion, and comparison with the Facebook article extractor
are at http://blog.davidziegler.net/post/122176962/a-python-script-to-automatically-extract-excerpts-from

copyright: Copyright 2009 by David Ziegler
license: MIT License
website: http://github.com/dziegler/excerpt_extractor/tree/master
"""
from BeautifulSoup import *
import urllib2
import cookielib
import re


def cleanSoup(soup):
    # Remove nodes that can never be part of an excerpt: javascript,
    # noscript fallbacks, inline css, the doctype, and HTML comments.
    for tag in ('script', 'noscript', 'style'):
        for subtree in soup(tag):
            subtree.extract()
    for doctype in soup.findAll(text=re.compile("DOCTYPE")):
        doctype.extract()
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    return soup


def removeHeaders(soup):
    # Strip h1-h6 so headlines don't win the longest-line heuristic
    # used as the fallback in get_summary().
    for level in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        for subtree in soup(level):
            subtree.extract()
    return soup


def get_summary(url):
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    doc = opener.open(url).read()

    # Parse only the <head> first: the title and meta description live there.
    soup = cleanSoup(BeautifulSoup(doc, parseOnlyThese=SoupStrainer('head')))
    if not soup.get_starttag_text():
        print "Invalid input"
        return None

    try:
        title = soup.head.title.string
    except AttributeError:
        title = None

    # Prefer the meta description when the page provides one.
    description = ''
    for meta in soup.findAll('meta'):
        if 'description' == meta.get('name', '').lower():
            description = meta['content']
            break

    # Otherwise fall back to the longest line of body text, with headers
    # removed, truncated to 255 characters.
    if not description:
        soup = removeHeaders(cleanSoup(BeautifulSoup(doc, parseOnlyThese=SoupStrainer('body'))))
        text = ''.join(soup.findAll(text=True)).split('\n')
        description = max((len(i.strip()), i) for i in text)[1].strip()[0:255]

    return (title, description)


if __name__ == "__main__":
    urllist = ("http://www.sfgate.com/cgi-bin/article.cgi?f=/c/a/2009/06/04/DD7V1806SV.DTL&type=performance",
               "http://www.chloeveltman.com/blog/2009/05/two-very-different-symphonies.html#links",
               "http://www.chloeveltman.com/blog/2009/06/child-prodigy-at-peabody-essex-museum.html#links",
               "http://www.sfgate.com/cgi-bin/article.cgi?f=/c/a/2009/06/04/NS9617O7JK.DTL&type=performance",
               "http://blogs.mercurynews.com/aei/2009/06/04/ramya-auroprem-joins-cast-of-spelling-bee/",
               "http://www.mercurynews.com/karendsouza/ci_12510394",
               "http://www.reason.com/news/show/134059.html")
    for u in urllist:
        print get_summary(u)[1] + '\n'
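The script targets Python 2 and the BeautifulSoup 3 API (urllib2, cookielib, parseOnlyThese), both long unmaintained. For reference, the following is a minimal, hypothetical sketch of the same head-then-body heuristic on Python 3 with beautifulsoup4; it is not part of this repository, and the module layout and helper names are assumptions.

    # excerpt_extractor_py3.py -- hypothetical Python 3 port of the heuristic
    # above, assuming beautifulsoup4 is installed (pip install beautifulsoup4).
    import re
    import urllib.request
    from http.cookiejar import CookieJar

    from bs4 import BeautifulSoup, Comment, SoupStrainer


    def _clean(soup):
        # Same cleanup as cleanSoup(): drop script/noscript/style, doctype
        # text, and HTML comments before measuring text.
        for tag in soup(["script", "noscript", "style"]):
            tag.decompose()
        for node in soup.find_all(string=re.compile("DOCTYPE")):
            node.extract()
        for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
            comment.extract()
        return soup


    def get_summary(url):
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(CookieJar()))
        doc = opener.open(url).read()

        head = _clean(BeautifulSoup(doc, "html.parser",
                                    parse_only=SoupStrainer("head")))
        title = head.title.string if head.title else None

        # Prefer the meta description, as the original does.
        for meta in head.find_all("meta"):
            if meta.get("name", "").lower() == "description":
                return title, meta.get("content", "")

        # Fallback: longest line of body text, headers removed, capped at 255.
        body = _clean(BeautifulSoup(doc, "html.parser",
                                    parse_only=SoupStrainer("body")))
        for header in body(["h1", "h2", "h3", "h4", "h5", "h6"]):
            header.decompose()
        lines = "".join(body.find_all(string=True)).split("\n")
        longest = max(lines, key=lambda line: len(line.strip()))
        return title, longest.strip()[:255]

The bs4 version keeps the original's design choice of parsing the head and body separately with SoupStrainer, which avoids building a full parse tree when the meta description already answers the question.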