如题。太累了,直接贴上源码,应该能看明白;如果有不明白的地方,请给我留言,我会及时回复。
# -*- coding: cp936 -*-
import sgmllib
import urllib2
import HTMLParser
import codecs
class BookParser(sgmllib.SGMLParser):
def __init__(self):
# inherit from the SGMLParser class
sgmllib.SGMLParser.__init__(self)
# create a list this will store all the links found
self.links = []
self.bookNames = []
self.inside_a_element = 0
self.count = 0;
def unknown_starttag(self, tag, attrs):
#print "unknown tag start " + tag
for key, value in attrs:
if key.lower() == "href":
if "/book/" in value and ".html" in value:
self.links.append(value)
self.inside_a_element = 1
def unknown_endtag(self, tag):
#print "in end tag"
if self.inside_a_element:
self.inside_a_element = 0
self.count += 1
def handle_data(self, text):
#print text
if self.inside_a_element:
text = unicode(text, 'utf-8')
text = text + str(self.count) + ".txt"
self.bookNames.append(text)
codecs.open(text, 'a', "utf-8")
class PageParser(sgmllib.SGMLParser):
def __init__(self):
# inherit from the SGMLParser class
sgmllib.SGMLParser.__init__(self)
# create a list this will store all the links found
self.links = []
def unknown_starttag(self, tag, attrs):
#print "unknown tag start " + tag
for key, value in attrs:
if key.lower() == "href":
if "/book/" in value and ".html" in value:
self.links.append(value)
class ContentParser(sgmllib.SGMLParser):
def __init__(self, bookNames):
# inherit from the SGMLParser class
sgmllib.SGMLParser.__init__(self)
# create a list this will store all the divs(page content) found
self.divs = []
# create a list this will store all the h1(title) found
self.headone = []
print " in content parser " + bookNames
self.logfile = codecs.open(bookNames, 'a', "utf-8")
self.inside_a_element = 0
self.h = HTMLParser.HTMLParser()
# this function is called once an anchor tag is found
def unknown_starttag(self, tag, attrs):
#print "unknown tag start " + tag
if tag.lower() == "h1":
self.inside_a_element = 1
if tag.lower() == "p":
self.inside_a_element = 2
def unknown_endtag(self, tag):
if self.inside_a_element:
self.inside_a_element = 0
self.logfile.write("\n")
def handle_data(self, text):
#print "handle data " + text
if self.inside_a_element == 1:
text = unicode(text, 'utf-8')
self.logfile.write(text + "\n\n")
if self.inside_a_element == 2:
text = unicode(text, 'utf-8')
self.logfile.write(text)
def handle_charref(self, ref):
print "#############"
print "chart is " + ref
print self.h("&#%(ref)")
def handle_entityref(self, ref):
self.logfile.write(self.h.unescape("&"+ref+";"))
#print "#############"
#print "enttity is + " + ref
#print self.h.unescape("&"+ref+";")
def getBookList(url):
    """Fetch an author page and map each book link to its output file name.

    Args:
        url: address of the author page to download.

    Returns:
        A dict pairing the links collected by BookParser with the file names
        it generated; empty when the response is not served as text/html.
    """
    bookDict = {}
    sock = urllib2.urlopen(url)
    try:
        # gettype() yields the bare, lower-cased media type, so headers such
        # as "text/html; charset=utf-8" match and a missing header does not
        # raise.
        if sock.info().gettype() == 'text/html':
            parser = BookParser()
            parser.feed(sock.read())
            # Let the parser process whatever it is still buffering.
            parser.close()
            bookDict = dict(zip(parser.links, parser.bookNames))
    finally:
        sock.close()
    # Return the name that is always bound; the old code returned a
    # differently spelled variable that only existed on the HTML branch.
    return bookDict
def getPageList(bookUrl):
    """Fetch a book's index page and return the page links found on it.

    Args:
        bookUrl: address of the book index page to download.

    Returns:
        The list of links collected by PageParser; empty when the response
        is not served as text/html.
    """
    pageList = []
    sock = urllib2.urlopen(bookUrl)
    try:
        # gettype() yields the bare, lower-cased media type, so headers such
        # as "text/html; charset=utf-8" match and a missing header does not
        # raise.
        if sock.info().gettype() == 'text/html':
            parser = PageParser()
            parser.feed(sock.read())
            # Let the parser process whatever it is still buffering.
            parser.close()
            pageList = parser.links
    finally:
        sock.close()
    return pageList
def getPageContent(pageUrl, bookNames):
    """Download one page and append its text to the book's output file.

    Args:
        pageUrl: address of the page to download.
        bookNames: path of the output file handed to ContentParser.
    """
    sock = urllib2.urlopen(pageUrl)
    try:
        parser = ContentParser(bookNames)
        try:
            parser.feed(sock.read())
            # Let the parser process whatever it is still buffering.
            parser.close()
        finally:
            # Release the output file even when parsing fails; closing an
            # already closed file is harmless.
            parser.logfile.close()
    finally:
        sock.close()
    # Print whatever the parser collected in ``divs``.
    for link in parser.divs:
        print(link)
def main(wuxiaUrl):
    """Download every page of every book linked from an author page.

    Args:
        wuxiaUrl: address of the author page listing the books.
    """
    # Links collected from the pages are site-relative; this prefix turns
    # them into absolute addresses.
    site_root = "http://www.wuxia.net.cn"

    books = getBookList(wuxiaUrl)
    print(type(books))
    print(books)

    for book_href, out_name in books.iteritems():
        print("link is " + book_href)
        print("bookNames is " + out_name)
        for page_href in getPageList(site_root + book_href):
            getPageContent(site_root + page_href, out_name)
if __name__ == '__main__':
    # Author page whose books are downloaded.
    # NOTE: the address is fixed here; reading it from sys.argv (with a
    # usage message when it is missing) is not enabled.
    author_page = "http://www.wuxia.net.cn/author/shiweihan.html"
    main(author_page)