Переглянути джерело

Just import the strictly necessary things (replace `from BeautifulSoup import *` with an explicit bs4 import), and add the `#!/usr/bin/env python2` shebang

master
Xavier 4 роки тому
джерело
коміт
764fc4b368
1 змінених файлів з 12 додано та 11 видалено
  1. +12
    -11
      excerpt_extractor.py

+ 12
- 11
excerpt_extractor.py Переглянути файл

@@ -1,3 +1,4 @@
#!/usr/bin/env python2
"""
This is a simple script to automatically extract excerpts from articles. It
requires BeautifulSoup.
@@ -12,11 +13,14 @@ url = "http://someurl.com/goes/here"
Some examples, discussion, and comparison with the Facebook article extractor
are at http://blog.davidziegler.net/post/122176962/a-python-script-to-automatically-extract-excerpts-from

copyright: Copyright 2009 by David Ziegler
copyright:
Copyright 2009 by David Ziegler <david.ziegler@gmail.com>,
Copyright 2015 Xavier <somenxavier@gmail.com>
license: MIT License
website: http://github.com/dziegler/excerpt_extractor/tree/master
"""
from BeautifulSoup import *

from bs4 import BeautifulSoup, SoupStrainer, Comment
import urllib2
import cookielib
import re
@@ -40,7 +44,7 @@ def get_summary(url):
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
doc = opener.open(url).read()
soup = cleanSoup(BeautifulSoup(doc,parseOnlyThese=SoupStrainer('head')))
soup = cleanSoup(BeautifulSoup(doc,parse_only=SoupStrainer('head')))
if not soup:
raise ValueError, "Invalid output: %s" % url
@@ -57,20 +61,17 @@ def get_summary(url):
break
if not description:
soup = removeHeaders(cleanSoup(BeautifulSoup(doc,parseOnlyThese=SoupStrainer('body'))))
soup = removeHeaders(cleanSoup(BeautifulSoup(doc,parse_only=SoupStrainer('body'))))
text = ''.join(soup.findAll(text=True)).split('\n')
description = max((len(i.strip()),i) for i in text)[1].strip()[0:255]

return (title, description)

# Script entry point: fetch each URL in the demo list and print the excerpt
# extracted by get_summary (index [1] is the description; [0] is the title).
# NOTE(review): this is a rendered diff, not the file itself — the first
# urllist tuple below (sfgate/chloeveltman/mercurynews URLs) is the REMOVED
# pre-commit list and the second tuple is the ADDED post-commit list; in the
# actual committed file only the second assignment exists. Indentation of the
# guard body was stripped by the page scrape — confirm against the repo.
if __name__ == "__main__":
urllist=("http://www.sfgate.com/cgi-bin/article.cgi?f=/c/a/2009/06/04/DD7V1806SV.DTL&type=performance",
"http://www.chloeveltman.com/blog/2009/05/two-very-different-symphonies.html#links",
"http://www.chloeveltman.com/blog/2009/06/child-prodigy-at-peabody-essex-museum.html#links",
"http://www.sfgate.com/cgi-bin/article.cgi?f=/c/a/2009/06/04/NS9617O7JK.DTL&type=performance",
"http://blogs.mercurynews.com/aei/2009/06/04/ramya-auroprem-joins-cast-of-spelling-bee/",
"http://www.mercurynews.com/karendsouza/ci_12510394",
"http://www.reason.com/news/show/134059.html")
urllist=("http://www.crummy.com/software/BeautifulSoup/bs4/doc/",
"http://www.google.com",
"http://www.somenxavier.xyz",
"http://www.reason.com/news/show/134059.html")
# Python 2 print statement — consistent with the py2 shebang added in this commit.
for u in urllist:
print get_summary(u)[1] + '\n'

Завантаження…
Відмінити
Зберегти