下列代码有点长，如何精简？ - VFP论坛

问题点数：0 回复次数：1

下列代码有点长，如何精简？

下列代码能将《仙道九绝》指定章节小说完整地爬下来，但我总觉得代码有点长，不知能否精简（windows xp系统，低版本IE浏览器），现贴上代码，请各位高人指点，谢谢！
【VFP代码】如下：

CLEAR
CLOSE   DATABASES
SET   DEFAULT   TO   (ADDBS(JUSTPATH(SYS(16))))
DECLARE   long   PostMessageA   IN   user32   long, long, long, long
IF   !FILE("仙道九绝.dbf")
    CREATE   TABLE   仙道九绝 (title  C(100), addr  C(254), txt  M)
    USE
ENDIF
USE   仙道九绝   ALIAS   tu
of = CREATEOBJECT("form1")
of.show(1)
CLOSE   TABLES   ALL
CLEAR   ALL
RETURN

DEFINE   CLASS   form1   as   Form
    width = 800
    height = 600
    AutoCenter = .T.
    AllowOutput = .f.
    ADD   OBJECT   but   as   commandbutton   WITH   left=10, top=10, width=100, height=22, caption="开始(第7-11章)"
    ADD   OBJECT   grd   as   grid   WITH   left=10, top=40, width=250, height=550, RecordSource="tu", AllowCellSelection=.f.
    ADD   OBJECT   edt   as   editbox   WITH   left=280, top=40, width=510, height=550
    ADD   OBJECT   web   as Olecontrol   with   OleClass="Shell.Explorer.2", left=-100

    * 分页处理属性
    currentChapterContent = ""   && 存储当前章节的所有内容
    currentPageNumber = 1        && 当前页码
    maxPages = 4                 && 最大尝试页数
    isProcessingChapter = .F.    && 标记是否正在处理章节
    startChapter = 7             && 开始章节
    endChapter = 11              && 结束章节
    originalChapterUrl = ""      && 存储章节原始URL用于分页检测
    currentChapterBase = ""      && 当前章节的基础URL（不含页码）
    processedUrls = ""           && 存储已处理的URL避免重复
    currentChapterTitle = ""     && 当前章节标题

    PROCEDURE   Destroy
        UNBINDEVENTS(this.hWnd)
    ENDPROC

    PROCEDURE   Init
        this.web.Silent = .t.
        BINDEVENT(this.hWnd, 0x401, this, "myMessage")
    ENDPROC

    PROCEDURE   but.click
        ZAP   IN   "tu"
        thisform.edt.value = ""
        thisform.currentChapterContent = ""
        thisform.currentPageNumber = 1
        thisform.isProcessingChapter = .F.
        thisform.processedUrls = ""
        thisform.currentChapterBase = ""
        thisform.currentChapterTitle = ""
        thisform.web.navigate("http://www.)
    ENDPROC

    PROCEDURE   grd.click
        thisform.edt.value = tu.txt
        thisform.edt.SelStart = 0
    ENDPROC

    PROCEDURE   web.documentComplete(pdisp, url)
        IF   (SYS(3095, pdisp) == SYS(3095, this))
            PostMessageA(thisform.hWnd, 0x401, 0, 0)
        ENDIF
    ENDPROC

    FUNCTION   myMessage(hWnd, uMsg, wParam, lParam)
        dom = this.web.document
        IF   VARTYPE(dom)  != "O"
            this.Enabled = .t.
            RETURN
        ENDIF

        * 添加延迟，确保页面完全加载
        INKEY(1.5)

        IF   EMPTY(tu.addr)
            * 获取章节列表
            lis = dom.getElementsByTagName("li")
            LOCAL   nCount, chapterCount
            nCount = 0
            chapterCount = 0

            FOR   EACH   li   IN   lis
                IF   li.classname == "line3"
                    nCount = nCount + 1
                    * 只获取第7章到第11章
                    IF   nCount >= this.startChapter   AND   nCount <= this.endChapter
                        chapterCount = chapterCount + 1
                        INSERT INTO tu (title, addr, txt) VALUES (;
                            li.getElementsByTagName("a").item[0].innertext,;
                            li.getElementsByTagName("a").item[0].href, "";
                        )
                        ? "添加章节: " + li.getElementsByTagName("a").item[0].innertext
                    ENDIF

                    * 如果已经超过结束章节，退出循环
                    IF   nCount > this.endChapter
                        EXIT
                    ENDIF
                ENDIF
            ENDFOR

            IF   RECCOUNT("tu") = 0
                MESSAGEBOX("未找到第" + ALLTRIM(STR(this.startChapter)) + ;
                  "章至第" + ALLTRIM(STR(this.endChapter)) + "章的内容！")
                this.Enabled = .t.
                RETURN
            ENDIF

            GO   TOP   IN   "tu"
            this.grd.setfocus
            this.currentChapterContent = ""
            this.currentPageNumber = 1
            this.isProcessingChapter = .T.
            this.originalChapterUrl = ALLTRIM(tu.addr)
            this.currentChapterBase = this.GetChapterBase(this.originalChapterUrl)
            this.currentChapterTitle = ALLTRIM(tu.title)
            this.processedUrls = this.originalChapterUrl + "|"
            this.web.navigate(this.originalChapterUrl)
            RETURN
        ENDIF

        * 爬取当前章节内容
        IF   !EOF("tu") AND this.isProcessingChapter
            LOCAL   lcContent, lcNextPage, lcCurrentUrl
            lcContent = ""
            lcNextPage = ""
            lcCurrentUrl = this.web.LocationURL

            * 检查当前URL是否属于当前章节
            IF   this.currentChapterBase  != this.GetChapterBase(lcCurrentUrl)
                ? "警告: 检测到URL跳转到其他章节，停止当前章节爬取"
                * 直接保存当前章节内容并移动到下一章
                REPLACE   tu.txt   WITH   this.currentChapterContent
                ? "章节 '" + this.currentChapterTitle + "' 获取完成，共 " + ;
                  ALLTRIM(STR(this.currentPageNumber)) + " 页"

                * 重置分页状态
                this.currentChapterContent = ""
                this.currentPageNumber = 1
                this.processedUrls = ""

                * 移动到下一章
                SKIP   IN   "tu"
                IF   !EOF("tu")
                    this.grd.setfocus
                    this.originalChapterUrl = ALLTRIM(tu.addr)
                    this.currentChapterBase = this.GetChapterBase(this.originalChapterUrl)
                    this.currentChapterTitle = ALLTRIM(tu.title)
                    this.processedUrls = this.originalChapterUrl + "|"
                    this.web.navigate(this.originalChapterUrl)
                    RETURN
                ELSE
                    this.isProcessingChapter = .F.
                    GO   TOP   IN   "tu"
                    this.grd.setfocus
                    this.Enabled = .t.
                    MESSAGEBOX("第" + ALLTRIM(STR(this.startChapter)) + "章至第" + ;
                      ALLTRIM(STR(this.endChapter)) + "章内容爬取完成！")
                ENDIF
                RETURN
            ENDIF

            * 根据HTML结构，从id="chapter"的div中获取内容
            lcContent = this.ExtractChapterContent(dom)

            IF   EMPTY(lcContent)
                ? "警告: 无法提取章节内容，尝试备用方法..."
                lcContent = this.ExtractContentAlternative(dom)
            ENDIF

            * 清理内容
            lcContent = this.CleanContent(lcContent)

            * 将当前页内容追加到章节总内容中
            IF   this.currentPageNumber == 1
                this.currentChapterContent = this.currentChapterTitle + CHR(13) + CHR(10) + ;
                  CHR(13) + CHR(10) + lcContent
            ELSE
                this.currentChapterContent = this.currentChapterContent + CHR(13) + ;
                  CHR(10) + lcContent
            ENDIF

            * 查找下一页链接
            lcNextPage = this.FindNextPageLink(dom, lcCurrentUrl)

            * 如果有下一页且未超过最大页数限制，继续获取下一页
            IF   !EMPTY(lcNextPage) AND this.currentPageNumber < this.maxPages
                this.currentPageNumber = this.currentPageNumber + 1
                this.processedUrls = this.processedUrls + lcNextPage + "|"
                this.web.navigate(lcNextPage)
                ? "章节 '" + this.currentChapterTitle + "' 第 " + ;
                  ALLTRIM(STR(this.currentPageNumber-1)) + " 页获取完成，继续获取下一页..."
                RETURN
            ELSE
                * 没有下一页或达到最大页数，保存完整章节内容
                REPLACE   tu.txt   WITH   this.currentChapterContent

                ? "章节 '" + this.currentChapterTitle + "' 获取完成，共 " + ;
                  ALLTRIM(STR(this.currentPageNumber)) + " 页"

                * 重置分页状态
                this.currentChapterContent = ""
                this.currentPageNumber = 1
                this.processedUrls = ""

                * 移动到下一章
                SKIP   IN   "tu"
                IF   !EOF("tu")
                    this.grd.setfocus
                    this.originalChapterUrl = ALLTRIM(tu.addr)
                    this.currentChapterBase = this.GetChapterBase(this.originalChapterUrl)
                    this.currentChapterTitle = ALLTRIM(tu.title)
                    this.processedUrls = this.originalChapterUrl + "|"
                    this.web.navigate(this.originalChapterUrl)
                    RETURN
                ELSE
                    this.isProcessingChapter = .F.
                    GO   TOP   IN   "tu"
                    this.grd.setfocus
                    this.Enabled = .t.
                    MESSAGEBOX("第" + ALLTRIM(STR(this.startChapter)) + "章至第" + ;
                      ALLTRIM(STR(this.endChapter)) + "章内容爬取完成！")
                ENDIF
            ENDIF
        ENDIF
    ENDFUNC

    * 获取章节基础URL（去除页码部分）
    FUNCTION    GetChapterBase(url)
        LOCAL   lcBase

        lcBase = LOWER(ALLTRIM(url))

        * 移除页码部分，获取章节基础URL
        * 处理模式：/6780946/3664234966.html 和 /6780946/3664234966_2.html
        IF   AT("_", lcBase) > 0   AND   AT(".html", lcBase) > 0
            * 找到最后一个下划线位置
            LOCAL   lnPos
            lnPos = RAT("_", lcBase)
            IF   lnPos > 0
                lcBase = LEFT(lcBase, lnPos-1) + ".html"
            ENDIF
        ENDIF

        RETURN   lcBase
    ENDFUNC

    * 内容提取函数 - 根据HTML结构修正
    FUNCTION   ExtractChapterContent(dom)
        LOCAL   lcContent
        lcContent = ""
        * 3种方法任选一种
        * 方法1: 从id="chapter"的div中获取内容
        TRY
            lcContent = dom.getElementById("chapter").innertext
            IF   !EMPTY(lcContent)
                ? "通过ID 'chapter' 获取内容成功"
                RETURN   lcContent
            ENDIF
        CATCH
        ENDTRY

        * 方法2: 如果上面失败，尝试从id="content"的div中获取
        TRY
            lcContent = dom.getElementById("content").innertext
            IF   !EMPTY(lcContent)
                ? "通过ID 'content' 获取内容成功"
                RETURN   lcContent
            ENDIF
        CATCH
        ENDTRY

        * 方法3: 通过类名获取
        TRY
            divs = dom.getElementsByTagName("div")
            FOR   i = 0   TO   divs.length - 1
                TRY
                    IF   divs.item(i).className == "chapter" OR ;
                       AT("chapter", LOWER(divs.item(i).className)) > 0
                        lcContent = divs.item(i).innertext
                        IF   !EMPTY(lcContent)
                            ? "通过类名 '" + divs.item(i).className + "' 获取内容成功"
                            RETURN   lcContent
                        ENDIF
                    ENDIF
                CATCH
                ENDTRY
            ENDFOR
        CATCH
        ENDTRY

        RETURN   lcContent
    ENDFUNC

    * 备用内容提取方法
    FUNCTION    ExtractContentAlternative(dom)
        LOCAL   lcContent, i, elements, lcElementText, lcBestContent
        LOCAL   lnMaxLength, lnCurrentLength

        lcContent = ""
        lcBestContent = ""
        lnMaxLength = 0

        * 通过文本分析找到最可能是小说内容的元素
        TRY
            elements = dom.getElementsByTagName("div")
            FOR   i = 0   TO   elements.length - 1
                TRY
                    lcElementText = elements.item(i).innertext
                    lnCurrentLength = LEN(ALLTRIM(lcElementText))

                    * 如果这个元素的文本长度超过当前最大值，并且看起来像是正文
                    IF   lnCurrentLength > lnMaxLength AND ;
                       lnCurrentLength > 100 AND ;
                       AT("上一章", lcElementText) = 0   AND ;
                       AT("下一章", lcElementText) = 0   AND ;
                       AT("目录", lcElementText) = 0    AND ;
                       AT("首页", lcElementText) = 0    AND ;
                       AT("友情链接", lcElementText) = 0

                        lnMaxLength = lnCurrentLength
                        lcBestContent = lcElementText
                    ENDIF
                CATCH
                ENDTRY
            ENDFOR

            IF   !EMPTY(lcBestContent)
                ? "通过文本分析获取内容成功，长度: " + ALLTRIM(STR(lnMaxLength))
                RETURN   lcBestContent
            ENDIF
        CATCH
        ENDTRY

        RETURN   lcContent
    ENDFUNC

    * 查找下一页链接 - 改进版本，确保只找到当前章节的下一页
    FUNCTION   FindNextPageLink(dom, currentUrl)
        LOCAL   lcNextPage, i, links, lcHref, lcText

        lcNextPage = ""

        * 方法1: 查找包含"下一章"文本的链接，但需要验证是否属于当前章节
        TRY
            links = dom.getElementsByTagName("a")
            FOR   i = 0   TO   links.length - 1
                TRY
                    lcHref = links.item(i).href
                    lcText = links.item(i).innertext

                    IF   AT("下一章", lcText) > 0   OR   AT("下一页", lcText) > 0
                        * 确保不是重复的URL且属于当前章节
                        IF   AT(lcHref, this.processedUrls) = 0 AND ;
                           this.GetChapterBase(lcHref) == this.currentChapterBase
                            lcNextPage = lcHref
                            ? "找到当前章节下一页链接: " + lcNextPage
                            RETURN   lcNextPage
                        ENDIF
                    ENDIF
                CATCH
                ENDTRY
            ENDFOR
        CATCH
        ENDTRY

        * 方法2: 自动构造下一页URL
        lcNextPage = this.ConstructNextPageUrl(currentUrl)
        IF   !EMPTY(lcNextPage)   AND   AT(lcNextPage, this.processedUrls) = 0 AND ;
           this.GetChapterBase(lcNextPage) == this.currentChapterBase
            ? "通过构造找到下一页链接: " + lcNextPage
            RETURN   lcNextPage
        ENDIF

        RETURN   lcNextPage
    ENDFUNC

    * 构造下一页URL
    FUNCTION   ConstructNextPageUrl(currentUrl)
        LOCAL   lcNextUrl

        lcNextUrl = ""

        * 根据当前URL模式构造下一页
        IF   AT("_", currentUrl) > 0   AND   AT(".html", currentUrl) > 0
            * 处理类似 /6780946/3664234966_1.html, /6780946/3664234966_2.html 的模式
            LOCAL   lnPos, lcBase, lcPageNum
            lnPos = RAT("_", currentUrl)
            IF   lnPos > 0
                lcBase = LEFT(currentUrl, lnPos)
                lcPageNum = VAL(SUBSTR(currentUrl, lnPos+1))
                IF   lcPageNum > 0
                    lcNextUrl = lcBase + ALLTRIM(STR(lcPageNum + 1)) + ".html"
                ENDIF
            ENDIF
        ELSE
            * 如果当前是第一页，构造第二页
            IF   AT(".html", currentUrl) > 0
                lcNextUrl = STRTRAN(currentUrl, ".html", "_2.html")
            ENDIF
        ENDIF

        RETURN   lcNextUrl
    ENDFUNC

    * 内容清理函数
    FUNCTION   CleanContent(lcContent)
        LOCAL   lcCleaned

        lcCleaned = lcContent

        * 移除作者提示和广告文本
        lcCleaned = STRTRAN(lcCleaned, "【作者剑啸龙翔提示：如果章节内容错乱的话，关掉阅读模式即可正常】", "")
        lcCleaned = STRTRAN(lcCleaned, "UU看书", "")
        lcCleaned = STRTRAN(lcCleaned, "www., "")
        lcCleaned = STRTRAN(lcCleaned, "请记住本书首发域名", "")
        lcCleaned = STRTRAN(lcCleaned, "【首发．请记住网址（WWW.）】", "")
        lcCleaned = STRTRAN(lcCleaned, "（WWW.）", "")
        lcCleaned = STRTRAN(lcCleaned, "上一章加入书签下一章 ","")
        lcCleaned = STRTRAN(lcCleaned, "[","")
        lcCleaned = STRTRAN(lcCleaned, "]","")

        * 清理HTML实体和多余空格
        lcCleaned = STRTRAN(lcCleaned, " ", " ")
        lcCleaned = STRTRAN(lcCleaned, CHR(13) + CHR(10) + CHR(13) + CHR(10), CHR(13) + CHR(10))
        lcCleaned = STRTRAN(lcCleaned, CHR(10) + CHR(10), CHR(10))

        * 整理文本格式
        lcCleaned = STRTRAN(lcCleaned, "    ", "  ")  && 减少空格数量
        lcCleaned = ALLTRIM(lcCleaned)

        RETURN   lcCleaned
    ENDFUNC
ENDDEFINE

搜索更多相关主题的帖子: 当前　 this　内容　 RETURN　IF