Python script to automatically extract excerpts from articles
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

77 lines
2.5 KiB

#!/usr/bin/env python2
"""
This is a simple script to automatically extract excerpts from articles. It
requires BeautifulSoup.

Usage:

    from excerpt_extractor import get_summary
    url = ""
    (title,description) = get_summary(url)

Some examples, discussion, and comparison with the Facebook article extractor
are at [URL missing from this copy].

Copyright 2009 by David Ziegler <>,
Copyright 2015 Xavier <>
license: MIT License
"""
from bs4 import BeautifulSoup, SoupStrainer, Comment
import urllib2
import cookielib
import re
def cleanSoup(soup):
    """Strip non-content nodes from ``soup`` in place and return it.

    Removes every ``script``, ``noscript`` and ``style`` element, every text
    node matching ``DOCTYPE``, and every comment node.

    Returns the same ``soup`` object that was passed in.
    """
    # get rid of javascript, noscript and css
    for elem in ('script', 'noscript', 'style'):
        for tree in soup(elem):
            tree.extract()

    # get rid of doctype
    for tree in soup.findAll(text=re.compile("DOCTYPE")):
        tree.extract()

    # get rid of comments
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    return soup
def removeHeaders(soup):
    """Remove every ``h1``..``h6`` element from ``soup`` in place.

    Returns the same ``soup`` object that was passed in.
    """
    for elem in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        for tree in soup(elem):
            tree.extract()
    return soup
def get_summary(url):
    """Fetch ``url`` and return a ``(title, description)`` tuple for the page.

    ``title`` is the string of the ``<title>`` element inside ``<head>``, or
    ``None`` when the document has no such element.

    ``description`` is the ``content`` of a ``<meta name="description">`` tag
    when one with non-empty content exists (the last such tag wins).
    Otherwise it is the longest line of body text, after scripts, styles,
    comments and headings have been removed, stripped and cut to 255
    characters.

    Raises ``ValueError`` if parsing the head yields a falsy soup object.
    Errors raised while opening or reading ``url`` are not caught here.
    """
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    response = opener.open(url)
    try:
        doc = response.read()
    finally:
        response.close()

    soup = cleanSoup(BeautifulSoup(doc, parse_only=SoupStrainer('head')))
    if not soup:
        raise ValueError("Invalid output: %s" % url)

    # A page without <head> or <title> yields None somewhere along the
    # attribute chain; report that as a missing title rather than crashing.
    try:
        title = soup.head.title.string
    except AttributeError:
        title = None

    description = ''
    for meta in soup.findAll('meta'):
        if 'description' == meta.get('name', '').lower():
            # A description tag without usable content must not raise or
            # wipe out a description found earlier.
            content = meta.get('content')
            if content:
                description = content

    if not description:
        # No meta description: fall back to the longest line of body text.
        body = removeHeaders(
            cleanSoup(BeautifulSoup(doc, parse_only=SoupStrainer('body'))))
        text = ''.join(body.findAll(text=True)).split('\n')
        description = max((len(i.strip()), i) for i in text)[1].strip()[0:255]

    return (title, description)
if __name__ == "__main__":
    import sys

    # The URLs to summarize are taken from the command line, e.g.
    #   python excerpt_extractor.py URL [URL ...]
    urllist = sys.argv[1:]
    for u in urllist:
        # Print only the description, followed by a blank line.
        print(get_summary(u)[1] + '\n')