怎样去除爬取小说中的广告内容
下列程序能正常爬取指定章节的小说,但爬下来的小说中广告内容太多,靠手工去除实在麻烦,请教各位高手,有没有用vfp代码去除的方法,望赐教,谢谢!* 爬取小说《羊城不相信爱情》
CLEAR
CLOSE DATABASES
SET DEFAULT TO (ADDBS(JUSTPATH(SYS(16))))
DECLARE long PostMessageA IN user32 long,long,long,long
IF !FILE("羊城不相信爱情.dbf")
CREATE TABLE 羊城不相信爱情 (title C(50), addr C(254), txt M)
USE
ENDIF
USE 羊城不相信爱情 ALIAS tu
of = CREATEOBJECT("form1")
of.show(1)
CLOSE TABLES ALL
CLEAR ALL
RETURN
DEFINE CLASS form1 as Form
width = 800
height = 600
AutoCenter = .T.
AllowOutput = .f.
* 分页与章节范围控制
nCurrentPage = 1
cBaseUrl = ""
nMaxPage = 3
nStartChap = 20
nEndChap = 25
nChapCount = 0
ADD OBJECT but as commandbutton WITH left=10,top=10,width=100,height=22,caption="开始"
ADD OBJECT grd as grid WITH left=10,top=40,width=250,height=550,RecordSource="tu",AllowCellSelection=.f.
ADD OBJECT edt as editbox WITH left=280,top=40,width=510,height=550
ADD OBJECT web as Olecontrol with OleClass="Shell.Explorer.2",left=-100
PROCEDURE Destroy
UNBINDEVENTS(this.hWnd)
ENDPROC
PROCEDURE Init
this.web.Silent = .t.
BINDEVENT(this.hWnd, 0x401, this, "myMessage")
ENDPROC
PROCEDURE but.click
ZAP IN "tu"
thisform.edt.value = " "
thisform.nChapCount = 0
thisform.nCurrentPage = 1
thisform.cBaseUrl = ""
thisform.web.navigate("http://www.)
ENDPROC
PROCEDURE grd.click
thisform.edt.value = tu.txt
thisform.edt.SelStart = 0
ENDPROC
PROCEDURE web.documentComplete(pdisp, url)
IF (SYS(3095, pdisp) == SYS(3095, this))
PostMessageA(thisform.hWnd, 0x401, 0, 0)
ENDIF
ENDPROC
* 修复编码乱码的工具函数
FUNCTION FixEncoding(cText)
LOCAL cResult
* 处理低版本IE GB2312与VFP内部编码不兼容问题
cResult = STRCONV(STRCONV(cText, 5), 6)
* 移除不可见控制字符
cResult = STRTRAN(cResult, CHR(0), "")
cResult = STRTRAN(cResult, CHR(7), "")
cResult = STRTRAN(cResult, CHR(12), "")
RETURN cResult
ENDFUNC
* 批量清理复杂广告文本(支持扩展广告特征)
FUNCTION BatchCleanAd(cRawText)
LOCAL cResult, i, aAdList[1]
cResult = cRawText
* 定义广告特征列表(可按需扩展)
DIMENSION aAdList[32]
aAdList[1] = "【作者美好遇见提示:如果章节内容错乱的话,关掉阅读模式即可正常】"
aAdList[2] = "[特种兵军旅小说:青衣文学网]"
aAdList[3] = "【好书推荐站:恍惚文学网】"
aAdList[4] = "『全网热议小说:博库文学网』"
aAdList[5] = "【言情小说精选:文启书库】"
aAdList[6] = "【在线阅读精选:三顾书屋】"
aAdList[7] = "(大秦帝国传:众阅阁)"
aAdList[8] = "《超甜宠文推荐:归云文学网》"
aAdList[9] = "【夜读精选:孤灯阁】"
aAdList[10] = "玄幻爽文精选:紫翠文学网"
aAdList[11] = "【书友推荐榜:紫翠文学网】"
aAdList[12] = "【小说迷最爱:暖冬阁】"
aAdList[13] = "《霸道总裁言情:智博文学网》"
aAdList[14] = "【最新完结小说:拾忆文学网】"
aAdList[15] = "【都市言情精选:芳泽小说网】"
aAdList[16] = "【优质长篇小说:蓝月小说网】"
aAdList[17] = "(精选完本小说:淡陌文学网)"
aAdList[18] = "【文学爱好者天堂:爱好文学】"
aAdList[19] = "【高口碑文学:众阅阁】"
aAdList[20] = "【浪漫言情站点:紫翠轩】"
aAdList[21] = "(都市热血必读:智博文学网)"
aAdList[22] = "《值得一看的文学佳作:碧凡小说网》"
aAdList[23] = "【畅销书推荐:创世阁】"
aAdList[24] = "『时空穿越奇遇:以山文学网』"
aAdList[25] = "【超甜宠文推荐:雅轩书屋】"
aAdList[26] = "【高评分小说合集:书易小说网】"
aAdList[27] = "《阅读爱好者精选:梦轩小说》"
aAdList[28] = "【精选推理小说:高雅文学网】"
aAdList[29] = "【女生最爱小说:轻语书屋】"
aAdList[30] = "【书迷的最爱:半味书屋】"
aAdList[31] = "["
aAdList[32] = "]"
* 批量替换广告文本
FOR i = 1 TO ALEN(aAdList)
IF !EMPTY(aAdList[i])
cResult = STRTRAN(cResult, aAdList[i], "")
ENDIF
ENDFOR
* 补充清理通用广告格式(文学网/小说网类广告)
cResult = thisform.CleanGeneralAd(cResult)
* 合并连续空行、去除首尾无效换行
cResult = STRTRAN(cResult, CHR(13)+CHR(10)+CHR(13)+CHR(10), CHR(13)+CHR(10))
cResult = STRTRAN(cResult, CHR(13)+CHR(10)+CHR(13)+CHR(10), CHR(13)+CHR(10))
cResult = LTRIM(RTRIM(cResult, CHR(13)+CHR(10)), CHR(13)+CHR(10))
RETURN cResult
ENDFUNC
* 清理通用格式广告(适配新增未知广告)
FUNCTION CleanGeneralAd(cText)
LOCAL cResult, nStart, nEnd, cMarkers
cResult = cText
cMarkers = "【】[]()《》『』"
* 清理包含"文学网""小说网""书屋""阁"的括号广告
FOR i = 1 TO LEN(cMarkers)
LOCAL cStart, cEnd
cStart = SUBSTR(cMarkers, i, 1)
* 匹配对应闭合符号
DO CASE
CASE cStart = "【"
cEnd = "】"
CASE cStart = "["
cEnd = "]"
CASE cStart = "("
cEnd = ")"
CASE cStart = "《"
cEnd = "》"
CASE cStart = "『"
cEnd = "』"
OTHERWISE
LOOP
ENDCASE
* 循环删除符合特征的广告
DO WHILE AT(cStart, cResult) > 0
nStart = AT(cStart, cResult)
nEnd = AT(cEnd, cResult, nStart)
IF nEnd > nStart
LOCAL cSub
cSub = SUBSTR(cResult, nStart + 1, nEnd - nStart - 1)
* 判断是否为广告(包含指定关键词则删除)
IF "文学网" $ cSub OR "小说网" $ cSub OR "书屋" $ cSub OR "阁" $ cSub
cResult = STUFF(cResult, nStart, nEnd - nStart + 1, "")
ELSE
EXIT
ENDIF
ELSE
EXIT
ENDIF
ENDDO
ENDFOR
RETURN cResult
ENDFUNC
FUNCTION myMessage(hWnd, uMsg, wParam, lParam)
IF VARTYPE(thisform.web) != "O"
RETURN
ENDIF
dom = thisform.web.document
IF VARTYPE(dom) != "O"
thisform.but.Enabled = .t.
RETURN
ENDIF
* 第一步:筛选指定章节列表
IF EMPTY(thisform.cBaseUrl)
lis = dom.getElementsByTagName("li")
IF VARTYPE(lis) = "O"
FOR EACH li IN lis
IF li.classname == "line3"
thisform.nChapCount = thisform.nChapCount + 1
IF thisform.nChapCount >= thisform.nStartChap+6 AND thisform.nChapCount <= thisform.nEndChap+6
oA = li.getElementsByTagName("a").item(0)
IF VARTYPE(oA) = "O"
INSERT INTO tu VALUES (;
ALLTRIM(oA.innertext),;
ALLTRIM(oA.href), "";
)
ENDIF
ENDIF
ENDIF
ENDFOR
ENDIF
IF RECCOUNT("tu") = 0
MESSAGEBOX("未获取到第"+ALLTRIM(STR(thisform.nStartChap))+"-"+ALLTRIM(STR(thisform.nEndChap))+"章数据!", 0, "提示")
thisform.but.Enabled = .t.
RETURN
ENDIF
GO TOP IN "tu"
thisform.grd.setfocus
thisform.cBaseUrl = LEFT(ALLTRIM(tu.addr), AT(".html", ALLTRIM(tu.addr)) - 1)
thisform.nCurrentPage = 1
thisform.web.navigate(ALLTRIM(tu.addr))
RETURN
ENDIF
* 第二步:爬取并清理当前页内容(新增批量广告清理)
oChapter = dom.getElementById("chapter")
IF !ISNULL(oChapter)
cRawText = oChapter.innertext
* 先修复编码,再批量清理广告
cRawText = thisform.FixEncoding(cRawText)
cCleanText = thisform.BatchCleanAd(cRawText)
* 拼接章节内容
IF thisform.nCurrentPage == 1
cFullText = ALLTRIM(tu.title) + CHR(13)+CHR(10)+CHR(13)+CHR(10) + cCleanText
ELSE
cFullText = tu.txt + CHR(13)+CHR(10) + cCleanText
ENDIF
REPLACE tu.txt WITH cFullText
? "已爬取【" + ALLTRIM(tu.title) + "】第" + STR(thisform.nCurrentPage, 1) + "页(已清理复杂广告/修复乱码)"
ENDIF
* 第三步:分页逻辑控制
thisform.nCurrentPage = thisform.nCurrentPage + 1
IF thisform.nCurrentPage <= thisform.nMaxPage AND !ISNULL(oChapter)
cNextPageUrl = thisform.cBaseUrl + "_" + STR(thisform.nCurrentPage, 1) + ".html"
thisform.web.navigate(cNextPageUrl)
ELSE
thisform.nCurrentPage = 1
SKIP IN "tu"
IF !EOF("tu")
thisform.cBaseUrl = LEFT(ALLTRIM(tu.addr), AT(".html", ALLTRIM(tu.addr)) - 1)
thisform.web.navigate(ALLTRIM(tu.addr))
ELSE
GO TOP IN "tu"
thisform.grd.setfocus
thisform.but.Enabled = .t.
MESSAGEBOX("第"+ALLTRIM(STR(thisform.nStartChap))+"-"+ALLTRIM(STR(thisform.nEndChap))+"章已爬取完成!", 0, "提示")
ENDIF
ENDIF
ENDFUNC
ENDDEFINE






