怎样去除下载的小说中的广告及乱码现象 - VFP论坛

问题点数：20 回复次数：0

怎样去除下载的小说中的广告及乱码现象

下列代码，能去除下载的小说中的广告，但会出现乱码现象。
（windows xp系统、低版本IE浏览器）
如何修改代码防止下载的小说中出现乱码现象？请高手赐教，万分感谢！！！

CLEAR
CLOSE DATABASES
SET DEFAULT TO (ADDBS(JUSTPATH(SYS(16))))
DECLARE long PostMessageA IN user32 long,long,long,long
IF !FILE("羊城不相信爱情.dbf")
    CREATE TABLE 羊城不相信爱情 (title C(50), addr C(254), txt M)
    USE
ENDIF
USE 羊城不相信爱情 ALIAS tu
of = CREATEOBJECT("form1")
of.show(1)
CLOSE TABLES ALL
CLEAR ALL
RETURN

DEFINE CLASS form1 as Form
    width = 800
    height = 600
    AutoCenter = .T.
    AllowOutput = .f.
    * 分页与章节范围控制
    nCurrentPage = 1
    cBaseUrl = ""
    nMaxPage = 3
    nStartChap = 1
    nEndChap = 5
    nChapCount = 0

    ADD OBJECT but as commandbutton WITH left=10,top=10,width=100,height=22,caption="开始"
    ADD OBJECT grd as grid WITH left=10,top=40,width=250,height=550,RecordSource="tu",AllowCellSelection=.f.
    ADD OBJECT edt as editbox WITH left=280,top=40,width=510,height=550
    ADD OBJECT web as Olecontrol with OleClass="Shell.Explorer.2",left=-100

    PROCEDURE Destroy
        UNBINDEVENTS(this.hWnd)
    ENDPROC

    PROCEDURE Init
        this.web.Silent = .t.
        BINDEVENT(this.hWnd, 0x401, this, "myMessage")
    ENDPROC

    PROCEDURE but.click
        ZAP IN "tu"
        thisform.edt.value = " "
        thisform.nChapCount = 0
        thisform.nCurrentPage = 1
        thisform.cBaseUrl = ""
        thisform.web.navigate("http://www.)
    ENDPROC

    PROCEDURE grd.click
        thisform.edt.value = tu.txt
        thisform.edt.SelStart = 0
    ENDPROC

    PROCEDURE web.documentComplete(pdisp, url)
        IF (SYS(3095, pdisp) == SYS(3095, this))
            PostMessageA(thisform.hWnd, 0x401, 0, 0)
        ENDIF
    ENDPROC

    * 兼容低版本VFP的文本清理函数
    FUNCTION RemoveAllBrackets(cRawText)
        LOCAL cResult, i
        LOCAL ARRAY aBrackets[10, 2]

        * 定义所有可能的括号对
        aBrackets[1, 1] = "【"
        aBrackets[1, 2] = "】"
        aBrackets[2, 1] = "["
        aBrackets[2, 2] = "]"
        aBrackets[3, 1] = "《"
        aBrackets[3, 2] = "》"
        aBrackets[4, 1] = "『"
        aBrackets[4, 2] = "』"
        aBrackets[5, 1] = "("
        aBrackets[5, 2] = ")"
        aBrackets[6, 1] = "（"
        aBrackets[6, 2] = "）"
        aBrackets[7, 1] = "{"
        aBrackets[7, 2] = "}"
        aBrackets[8, 1] = "「"
        aBrackets[8, 2] = "」"
        aBrackets[9, 1] = "<"
        aBrackets[9, 2] = ">"
        aBrackets[10, 1] = "〖"
        aBrackets[10, 2] = "〗"

        cResult = cRawText

        * 循环删除所有括号对
        FOR i = 1 TO ALEN(aBrackets, 1)
            cResult = thisform.RemovePairedBrackets(cResult, aBrackets[i, 1], aBrackets[i, 2])
        ENDFOR

        RETURN cResult
    ENDFUNC

    * 删除成对括号及其内容
   FUNCTION RemovePairedBrackets(cRawText, cStart, cEnd)
    LOCAL cResult, nStart, nEnd, nLastEnd, nOccur

    cResult = ""
    nLastEnd = 1
    nStart = AT(cStart, cRawText)

    DO WHILE nStart > 0
        * 计算从当前位置开始的子串，然后查找cEnd
        cSubString = SUBSTR(cRawText, nStart + 1)
        nEndInSub = AT(cEnd, cSubString)

        IF nEndInSub > 0
            nEnd = nStart + nEndInSub  && 转换回原字符串中的位置

            * 删除这个括号对
            cResult = cResult + SUBSTR(cRawText, nLastEnd, nStart - nLastEnd)
            nLastEnd = nEnd + 1

            * 查找下一个cStart
            nStart = AT(cStart, SUBSTR(cRawText, nLastEnd))
            IF nStart > 0
                nStart = nStart + nLastEnd - 1
            ENDIF
        ELSE
            * 括号未闭合，跳出
            EXIT
        ENDIF
    ENDDO

    * 添加剩余部分
    IF nLastEnd <= LEN(cRawText)
        cResult = cResult + SUBSTR(cRawText, nLastEnd)
    ENDIF

    * 合并连续空行、去除首尾无效换行
    cResult  = STRTRAN(cResult , CHR(13)+CHR(10)+CHR(13)+CHR(10), CHR(13)+CHR(10))
    cResult  = STRTRAN(cResult , CHR(13)+CHR(10)+CHR(13)+CHR(10), CHR(13)+CHR(10))
    cResult = LTRIM(RTRIM(cResult , CHR(13)+CHR(10)), CHR(13)+CHR(10))

    RETURN cResult
ENDFUNC

    FUNCTION myMessage(hWnd, uMsg, wParam, lParam)
        IF VARTYPE(thisform.web) != "O"
            RETURN
        ENDIF
        dom = thisform.web.document
        IF VARTYPE(dom) != "O"
            thisform.but.Enabled = .t.
            RETURN
        ENDIF

        * 第一步：筛选指定章节列表
        IF EMPTY(thisform.cBaseUrl)
            lis = dom.getElementsByTagName("li")
            IF VARTYPE(lis) = "O"
                FOR EACH li IN lis
                    IF li.classname == "line3"
                        thisform.nChapCount = thisform.nChapCount + 1
                        IF thisform.nChapCount >= thisform.nStartChap+6 AND thisform.nChapCount <= thisform.nEndChap+6
                            oA = li.getElementsByTagName("a").item(0)
                            IF VARTYPE(oA) = "O"
                                INSERT INTO tu VALUES (;
                                    ALLTRIM(oA.innertext),;
                                    ALLTRIM(oA.href), "";
                                )
                            ENDIF
                        ENDIF
                    ENDIF
                ENDFOR
            ENDIF
            IF RECCOUNT("tu") = 0
                MESSAGEBOX("未获取到第"+ALLTRIM(STR(thisform.nStartChap))+"-"+ALLTRIM(STR(thisform.nEndChap))+"章数据！", 0, "提示")
                thisform.but.Enabled = .t.
                RETURN
            ENDIF
            GO TOP IN "tu"
            thisform.grd.setfocus
            thisform.cBaseUrl = LEFT(ALLTRIM(tu.addr), AT(".html", ALLTRIM(tu.addr)) - 1)
            thisform.nCurrentPage = 1
            thisform.web.navigate(ALLTRIM(tu.addr))
            RETURN
        ENDIF

        * 第二步：爬取并清理当前页内容
        oChapter = dom.getElementById("chapter")
        IF !ISNULL(oChapter)
            cRawText = oChapter.innertext
            * 调用清理函数处理文本
            cCleanText = thisform.RemoveAllBrackets(cRawText)
            * 拼接章节内容
            IF thisform.nCurrentPage == 1
                cFullText = ALLTRIM(tu.title) + CHR(13)+CHR(10)+CHR(13)+CHR(10) + cCleanText
            ELSE
                cFullText = tu.txt + CHR(13)+CHR(10) + cCleanText
            ENDIF
            REPLACE tu.txt WITH cFullText
            ? "已爬取【" + ALLTRIM(tu.title) + "】第" + STR(thisform.nCurrentPage, 1) + "页（已清理广告/空行）"
        ENDIF

        * 第三步：分页逻辑控制
        thisform.nCurrentPage = thisform.nCurrentPage + 1
        IF thisform.nCurrentPage <= thisform.nMaxPage AND !ISNULL(oChapter)
            cNextPageUrl = thisform.cBaseUrl + "_" + STR(thisform.nCurrentPage, 1) + ".html"
            thisform.web.navigate(cNextPageUrl)
        ELSE
            thisform.nCurrentPage = 1
            SKIP IN "tu"
            IF !EOF("tu")
                thisform.cBaseUrl = LEFT(ALLTRIM(tu.addr), AT(".html", ALLTRIM(tu.addr)) - 1)
                thisform.web.navigate(ALLTRIM(tu.addr))
            ELSE
                GO TOP IN "tu"
                thisform.grd.setfocus
                thisform.but.Enabled = .t.
                MESSAGEBOX("第"+ALLTRIM(STR(thisform.nStartChap))+"-"+ALLTRIM(STR(thisform.nEndChap))+"章已爬取完成！", 0, "提示")
            ENDIF
        ENDIF
    ENDFUNC
ENDDEFINE

[此贴子已经被作者于2025-12-8 21:23编辑过]

搜索更多相关主题的帖子: addr　thisform　IF　web　RETURN