Python script to automatically extract excerpts from articles
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

77 lines
2.5 KiB

#!/usr/bin/env python2
"""
This is a simple script to automatically extract excerpts from articles. It
requires BeautifulSoup.
Usage:
from excerpt_extractor import get_summary
url = "http://someurl.com/goes/here"
(title,description) = get_summary(url)
==========================================
Some examples, discussion, and comparison with the Facebook article extractor
are at http://blog.davidziegler.net/post/122176962/a-python-script-to-automatically-extract-excerpts-from
copyright:
Copyright 2009 by David Ziegler <david.ziegler@gmail.com>,
Copyright 2015 Xavier <somenxavier@gmail.com>
license: MIT License
website: http://github.com/dziegler/excerpt_extractor/tree/master
"""
from bs4 import BeautifulSoup, SoupStrainer, Comment
import urllib2
import cookielib
import re
def cleanSoup(soup):
    """Strip non-visible nodes from *soup* in place and return it.

    Removes <script>, <noscript> and <style> subtrees, stray DOCTYPE
    text nodes, and HTML comments, leaving only renderable content.
    """
    # Detach script/style subtrees.  These loops run for their side
    # effect (extract mutates the tree), so plain for-loops are clearer
    # than the list comprehensions the original used.
    for elem in ('script', 'noscript', 'style'):
        for tree in soup(elem):
            tree.extract()
    # Drop any DOCTYPE declaration that survived parsing as a text node.
    for tree in soup.findAll(text=re.compile("DOCTYPE")):
        tree.extract()
    # Drop HTML comments (<!-- ... -->).
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    return soup
def removeHeaders(soup):
    """Remove all heading tags (<h1>..<h6>) from *soup* in place and return it.

    Headings tend to duplicate the title or navigation rather than body
    prose, so they are stripped before the longest-line heuristic in
    get_summary picks a description.
    """
    # Side-effect loop (extract mutates the tree) instead of the
    # original's nested list comprehensions.
    for elem in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        for tree in soup(elem):
            tree.extract()
    return soup
def get_summary(url):
    """Fetch *url* and return a ``(title, description)`` tuple.

    The title is taken from <head><title>.  The description is the
    content of <meta name="description"> when present; otherwise the
    longest line of visible body text, truncated to 255 characters.

    Raises ValueError when the page yields no parseable <head> soup.
    """
    # Cookie-aware opener: some sites refuse clients that drop cookies.
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    doc = opener.open(url).read()

    # Parse only <head> first -- cheap, and it carries title/meta tags.
    soup = cleanSoup(BeautifulSoup(doc, parse_only=SoupStrainer('head')))
    if not soup:
        # Call-style raise works on both Python 2 and 3, unlike the
        # original comma syntax.
        raise ValueError("Invalid output: %s" % url)

    # A missing <head>/<title> chain raises AttributeError on None;
    # treat that as "no title" rather than swallowing everything with
    # a bare except (which would also hide KeyboardInterrupt).
    try:
        title = soup.head.title.string
    except AttributeError:
        title = None

    description = ''
    for meta in soup.findAll('meta'):
        if meta.get('name', '').lower() == 'description':
            description = meta['content']
            break

    if not description:
        # No meta description: fall back to the longest stripped line
        # of body text (headings removed so titles don't win).
        soup = removeHeaders(cleanSoup(BeautifulSoup(doc, parse_only=SoupStrainer('body'))))
        text = ''.join(soup.findAll(text=True)).split('\n')
        description = max((len(line.strip()), line) for line in text)[1].strip()[0:255]
    return (title, description)
if __name__ == "__main__":
urllist=("http://www.crummy.com/software/BeautifulSoup/bs4/doc/",
"http://www.google.com",
"http://www.somenxavier.xyz",
"http://www.reason.com/news/show/134059.html")
for u in urllist:
print get_summary(u)[1] + '\n'