怎样去除下载的小说中的广告及乱码现象
下列代码,能去除下载的小说中的广告,但会出现乱码现象。(windows xp系统、低版本IE浏览器)
如何修改代码防止下载的小说中出现乱码现象?请高手赐教,万分感谢!!!
CLEAR
CLOSE DATABASES
SET DEFAULT TO (ADDBS(JUSTPATH(SYS(16))))
DECLARE long PostMessageA IN user32 long,long,long,long
IF !FILE("羊城不相信爱情.dbf")
CREATE TABLE 羊城不相信爱情 (title C(50), addr C(254), txt M)
USE
ENDIF
USE 羊城不相信爱情 ALIAS tu
of = CREATEOBJECT("form1")
of.show(1)
CLOSE TABLES ALL
CLEAR ALL
RETURN
DEFINE CLASS form1 as Form
width = 800
height = 600
AutoCenter = .T.
AllowOutput = .f.
* 分页与章节范围控制
nCurrentPage = 1
cBaseUrl = ""
nMaxPage = 3
nStartChap = 1
nEndChap = 5
nChapCount = 0
ADD OBJECT but as commandbutton WITH left=10,top=10,width=100,height=22,caption="开始"
ADD OBJECT grd as grid WITH left=10,top=40,width=250,height=550,RecordSource="tu",AllowCellSelection=.f.
ADD OBJECT edt as editbox WITH left=280,top=40,width=510,height=550
ADD OBJECT web as Olecontrol with OleClass="Shell.Explorer.2",left=-100
PROCEDURE Destroy
UNBINDEVENTS(this.hWnd)
ENDPROC
PROCEDURE Init
this.web.Silent = .t.
BINDEVENT(this.hWnd, 0x401, this, "myMessage")
ENDPROC
PROCEDURE but.click
ZAP IN "tu"
thisform.edt.value = " "
thisform.nChapCount = 0
thisform.nCurrentPage = 1
thisform.cBaseUrl = ""
thisform.web.navigate("http://www.)
ENDPROC
PROCEDURE grd.click
thisform.edt.value = tu.txt
thisform.edt.SelStart = 0
ENDPROC
PROCEDURE web.documentComplete(pdisp, url)
IF (SYS(3095, pdisp) == SYS(3095, this))
PostMessageA(thisform.hWnd, 0x401, 0, 0)
ENDIF
ENDPROC
* 兼容低版本VFP的文本清理函数
FUNCTION RemoveAllBrackets(cRawText)
LOCAL cResult, i
LOCAL ARRAY aBrackets[10, 2]
* 定义所有可能的括号对
aBrackets[1, 1] = "【"
aBrackets[1, 2] = "】"
aBrackets[2, 1] = "["
aBrackets[2, 2] = "]"
aBrackets[3, 1] = "《"
aBrackets[3, 2] = "》"
aBrackets[4, 1] = "『"
aBrackets[4, 2] = "』"
aBrackets[5, 1] = "("
aBrackets[5, 2] = ")"
aBrackets[6, 1] = "("
aBrackets[6, 2] = ")"
aBrackets[7, 1] = "{"
aBrackets[7, 2] = "}"
aBrackets[8, 1] = "「"
aBrackets[8, 2] = "」"
aBrackets[9, 1] = "<"
aBrackets[9, 2] = ">"
aBrackets[10, 1] = "〖"
aBrackets[10, 2] = "〗"
cResult = cRawText
* 循环删除所有括号对
FOR i = 1 TO ALEN(aBrackets, 1)
cResult = thisform.RemovePairedBrackets(cResult, aBrackets[i, 1], aBrackets[i, 2])
ENDFOR
RETURN cResult
ENDFUNC
* 删除成对括号及其内容
FUNCTION RemovePairedBrackets(cRawText, cStart, cEnd)
LOCAL cResult, nStart, nEnd, nLastEnd, nOccur
cResult = ""
nLastEnd = 1
nStart = AT(cStart, cRawText)
DO WHILE nStart > 0
* 计算从当前位置开始的子串,然后查找cEnd
cSubString = SUBSTR(cRawText, nStart + 1)
nEndInSub = AT(cEnd, cSubString)
IF nEndInSub > 0
nEnd = nStart + nEndInSub && 转换回原字符串中的位置
* 删除这个括号对
cResult = cResult + SUBSTR(cRawText, nLastEnd, nStart - nLastEnd)
nLastEnd = nEnd + 1
* 查找下一个cStart
nStart = AT(cStart, SUBSTR(cRawText, nLastEnd))
IF nStart > 0
nStart = nStart + nLastEnd - 1
ENDIF
ELSE
* 括号未闭合,跳出
EXIT
ENDIF
ENDDO
* 添加剩余部分
IF nLastEnd <= LEN(cRawText)
cResult = cResult + SUBSTR(cRawText, nLastEnd)
ENDIF
* 合并连续空行、去除首尾无效换行
cResult = STRTRAN(cResult , CHR(13)+CHR(10)+CHR(13)+CHR(10), CHR(13)+CHR(10))
cResult = STRTRAN(cResult , CHR(13)+CHR(10)+CHR(13)+CHR(10), CHR(13)+CHR(10))
cResult = LTRIM(RTRIM(cResult , CHR(13)+CHR(10)), CHR(13)+CHR(10))
RETURN cResult
ENDFUNC
FUNCTION myMessage(hWnd, uMsg, wParam, lParam)
IF VARTYPE(thisform.web) != "O"
RETURN
ENDIF
dom = thisform.web.document
IF VARTYPE(dom) != "O"
thisform.but.Enabled = .t.
RETURN
ENDIF
* 第一步:筛选指定章节列表
IF EMPTY(thisform.cBaseUrl)
lis = dom.getElementsByTagName("li")
IF VARTYPE(lis) = "O"
FOR EACH li IN lis
IF li.classname == "line3"
thisform.nChapCount = thisform.nChapCount + 1
IF thisform.nChapCount >= thisform.nStartChap+6 AND thisform.nChapCount <= thisform.nEndChap+6
oA = li.getElementsByTagName("a").item(0)
IF VARTYPE(oA) = "O"
INSERT INTO tu VALUES (;
ALLTRIM(oA.innertext),;
ALLTRIM(oA.href), "";
)
ENDIF
ENDIF
ENDIF
ENDFOR
ENDIF
IF RECCOUNT("tu") = 0
MESSAGEBOX("未获取到第"+ALLTRIM(STR(thisform.nStartChap))+"-"+ALLTRIM(STR(thisform.nEndChap))+"章数据!", 0, "提示")
thisform.but.Enabled = .t.
RETURN
ENDIF
GO TOP IN "tu"
thisform.grd.setfocus
thisform.cBaseUrl = LEFT(ALLTRIM(tu.addr), AT(".html", ALLTRIM(tu.addr)) - 1)
thisform.nCurrentPage = 1
thisform.web.navigate(ALLTRIM(tu.addr))
RETURN
ENDIF
* 第二步:爬取并清理当前页内容
oChapter = dom.getElementById("chapter")
IF !ISNULL(oChapter)
cRawText = oChapter.innertext
* 调用清理函数处理文本
cCleanText = thisform.RemoveAllBrackets(cRawText)
* 拼接章节内容
IF thisform.nCurrentPage == 1
cFullText = ALLTRIM(tu.title) + CHR(13)+CHR(10)+CHR(13)+CHR(10) + cCleanText
ELSE
cFullText = tu.txt + CHR(13)+CHR(10) + cCleanText
ENDIF
REPLACE tu.txt WITH cFullText
? "已爬取【" + ALLTRIM(tu.title) + "】第" + STR(thisform.nCurrentPage, 1) + "页(已清理广告/空行)"
ENDIF
* 第三步:分页逻辑控制
thisform.nCurrentPage = thisform.nCurrentPage + 1
IF thisform.nCurrentPage <= thisform.nMaxPage AND !ISNULL(oChapter)
cNextPageUrl = thisform.cBaseUrl + "_" + STR(thisform.nCurrentPage, 1) + ".html"
thisform.web.navigate(cNextPageUrl)
ELSE
thisform.nCurrentPage = 1
SKIP IN "tu"
IF !EOF("tu")
thisform.cBaseUrl = LEFT(ALLTRIM(tu.addr), AT(".html", ALLTRIM(tu.addr)) - 1)
thisform.web.navigate(ALLTRIM(tu.addr))
ELSE
GO TOP IN "tu"
thisform.grd.setfocus
thisform.but.Enabled = .t.
MESSAGEBOX("第"+ALLTRIM(STR(thisform.nStartChap))+"-"+ALLTRIM(STR(thisform.nEndChap))+"章已爬取完成!", 0, "提示")
ENDIF
ENDIF
ENDFUNC
ENDDEFINE
[此贴子已经被作者于2025-12-8 21:23编辑过]






