2011-09-30
使用python脚本下载www.wuxia.net.cn上的书籍，并且将它们合并成一个文件

如题。太累了，直接贴上源码，应该能看明白；如果有不明白的地方，请给我留言，我会及时回复。
# -*- coding: cp936 -*-
import sgmllib
import urllib2
import HTMLParser
import codecs

class BookParser(sgmllib.SGMLParser):
    def __init__(self):

        # inherit from the SGMLParser class
        sgmllib.SGMLParser.__init__(self)

        # create a list this will store all the links found
        self.links = []
        self.bookNames = []
        self.inside_a_element = 0
        self.count = 0;

    def unknown_starttag(self, tag, attrs):
        #print "unknown tag start " + tag
        for key, value in attrs:
            if key.lower() == "href":
                if "/book/" in value and ".html" in value:
                    self.links.append(value)
                    self.inside_a_element = 1


    def unknown_endtag(self, tag):
        #print "in end tag"
        if self.inside_a_element:
            self.inside_a_element = 0
            self.count += 1

    def handle_data(self, text):
        #print text
        if self.inside_a_element:
            text = unicode(text, 'utf-8')
            text = text + str(self.count) + ".txt"
            self.bookNames.append(text)
            codecs.open(text, 'a', "utf-8")

class PageParser(sgmllib.SGMLParser):
    def __init__(self):

        # inherit from the SGMLParser class
        sgmllib.SGMLParser.__init__(self)

        # create a list this will store all the links found
        self.links = []

    def unknown_starttag(self, tag, attrs):
        #print "unknown tag start " + tag
        for key, value in attrs:
            if key.lower() == "href":
                if "/book/" in value and ".html" in value:
                    self.links.append(value)


class ContentParser(sgmllib.SGMLParser):
    def __init__(self, bookNames):

        # inherit from the SGMLParser class
        sgmllib.SGMLParser.__init__(self)

        # create a list this will store all the divs(page content) found
        self.divs = []
        # create a list this will store all the h1(title) found
        self.headone = []
        print " in content parser " + bookNames
        self.logfile = codecs.open(bookNames, 'a', "utf-8")
        self.inside_a_element = 0
        self.h = HTMLParser.HTMLParser()

    # this function is called once an anchor tag is found

    def unknown_starttag(self, tag, attrs):
        #print "unknown tag start " + tag
        if tag.lower() == "h1":
            self.inside_a_element = 1
        if tag.lower() == "p":
            self.inside_a_element = 2

    def unknown_endtag(self, tag):
        if self.inside_a_element:
            self.inside_a_element = 0
            self.logfile.write("\n")

    def handle_data(self, text):
        #print "handle data "  + text
        if self.inside_a_element == 1:
            text = unicode(text, 'utf-8')
            self.logfile.write(text + "\n\n")

        if self.inside_a_element == 2:
            text = unicode(text, 'utf-8')
            self.logfile.write(text)

    def handle_charref(self, ref):
        print "#############"
        print "chart is " + ref
        print self.h("&#%(ref)")
    def handle_entityref(self, ref):
        self.logfile.write(self.h.unescape("&"+ref+";"))
        #print "#############"
        #print "enttity is + " + ref
        #print self.h.unescape("&"+ref+";")

def getBookList(url):
    """Return a dict mapping book link -> output file name for a page.

    Args:
        url: address of the page listing the books (an author page).

    Returns:
        A dict built from the links and names found by BookParser.  It is
        empty when the response is not an HTML document.
    """
    # Must be bound before the content-type check: the old code initialised
    # a differently spelled variable and raised a NameError on return when
    # the check failed.
    bookDic = {}
    sock = urllib2.urlopen(url)
    try:
        # gettype() yields the bare lower-cased "type/subtype", so a header
        # such as "text/html; charset=utf-8" is accepted as well.
        if sock.info().gettype() == 'text/html':
            parser = BookParser()
            parser.feed(sock.read())
            # Flush whatever input the parser is still buffering.
            parser.close()
            bookDic = dict(zip(parser.links, parser.bookNames))
    finally:
        sock.close()
    return bookDic


def getPageList(bookUrl):
    """Return the page links found on a book's index page.

    Args:
        bookUrl: absolute address of the book page.

    Returns:
        The list of hrefs collected by PageParser, in document order; an
        empty list when the response is not an HTML document.
    """
    pageList = []
    sock = urllib2.urlopen(bookUrl)
    try:
        # gettype() yields the bare lower-cased "type/subtype", so a header
        # such as "text/html; charset=utf-8" no longer results in an empty
        # page list, and a missing header no longer raises KeyError.
        if sock.info().gettype() == 'text/html':
            parser = PageParser()
            parser.feed(sock.read())
            # Flush whatever input the parser is still buffering.
            parser.close()
            pageList = parser.links
    finally:
        sock.close()

    return pageList

def getPageContent(pageUrl, bookNames):
    """Download one page and append its text to the book file.

    Args:
        pageUrl: absolute address of the page to download.
        bookNames: path of the file the page text is appended to.
    """
    sock = urllib2.urlopen(pageUrl)
    try:
        parser = ContentParser(bookNames)
        try:
            parser.feed(sock.read())
            # Flush input the parser is still buffering so the tail of the
            # page is not lost.
            parser.close()
        finally:
            # Release the output file explicitly instead of relying on
            # garbage collection; closing an already closed file is harmless.
            parser.logfile.close()
        # Print whatever the parser collected in divs.
        for link in parser.divs:
            print(link)
    finally:
        sock.close()


def main(wuxiaUrl):
    """Download every book linked from an author page.

    For each book found on ``wuxiaUrl`` all of its pages are fetched and
    their text is appended to that book's file.

    Args:
        wuxiaUrl: address of the author page listing the books.
    """
    # Imported here so the block does not depend on the file's import list.
    import urlparse

    bookDic = getBookList(wuxiaUrl)
    print(type(bookDic))
    print(bookDic)
    for link, bookNames in bookDic.iteritems():
        print("link is " + link)
        print("bookNames is " + bookNames)
        # Resolve hrefs against the page they were found on instead of
        # gluing them to a hard-coded host; root-relative links give the
        # same address as before, absolute and relative ones now work too.
        bookUrl = urlparse.urljoin(wuxiaUrl, link)
        for page in getPageList(bookUrl):
            getPageContent(urlparse.urljoin(bookUrl, page), bookNames)


if __name__ == '__main__':
    import sys

    # An author page may be given as the first command-line argument;
    # without one the script falls back to this default author page.
    if len(sys.argv) > 1:
        main(sys.argv[1])
    else:
        main("http://www.wuxia.net.cn/author/shiweihan.html")
贾亮的博客

Do Something

使用python脚本下载www.wuxia.net.cn上的书籍，并且将它们合并成一个文件