下列代码有点长,如何精简?
下列代码能将《仙道九绝》指定章节小说完整地爬下来,但我总觉得代码有点长,不知能否精简(windows xp系统,低版本IE浏览器),现贴上代码,请各位高人指点,谢谢!【VFP代码】如下:
CLEAR
CLOSE DATABASES
SET DEFAULT TO (ADDBS(JUSTPATH(SYS(16))))
DECLARE long PostMessageA IN user32 long, long, long, long
IF !FILE("仙道九绝.dbf")
CREATE TABLE 仙道九绝 (title C(100), addr C(254), txt M)
USE
ENDIF
USE 仙道九绝 ALIAS tu
of = CREATEOBJECT("form1")
of.show(1)
CLOSE TABLES ALL
CLEAR ALL
RETURN
DEFINE CLASS form1 as Form
width = 800
height = 600
AutoCenter = .T.
AllowOutput = .f.
ADD OBJECT but as commandbutton WITH left=10, top=10, width=100, height=22, caption="开始(第7-11章)"
ADD OBJECT grd as grid WITH left=10, top=40, width=250, height=550, RecordSource="tu", AllowCellSelection=.f.
ADD OBJECT edt as editbox WITH left=280, top=40, width=510, height=550
ADD OBJECT web as Olecontrol with OleClass="Shell.Explorer.2", left=-100
* 分页处理属性
currentChapterContent = "" && 存储当前章节的所有内容
currentPageNumber = 1 && 当前页码
maxPages = 4 && 最大尝试页数
isProcessingChapter = .F. && 标记是否正在处理章节
startChapter = 7 && 开始章节
endChapter = 11 && 结束章节
originalChapterUrl = "" && 存储章节原始URL用于分页检测
currentChapterBase = "" && 当前章节的基础URL(不含页码)
processedUrls = "" && 存储已处理的URL避免重复
currentChapterTitle = "" && 当前章节标题
PROCEDURE Destroy
UNBINDEVENTS(this.hWnd)
ENDPROC
PROCEDURE Init
this.web.Silent = .t.
BINDEVENT(this.hWnd, 0x401, this, "myMessage")
ENDPROC
PROCEDURE but.click
ZAP IN "tu"
thisform.edt.value = ""
thisform.currentChapterContent = ""
thisform.currentPageNumber = 1
thisform.isProcessingChapter = .F.
thisform.processedUrls = ""
thisform.currentChapterBase = ""
thisform.currentChapterTitle = ""
thisform.web.navigate("http://www.)
ENDPROC
PROCEDURE grd.click
thisform.edt.value = tu.txt
thisform.edt.SelStart = 0
ENDPROC
PROCEDURE web.documentComplete(pdisp, url)
IF (SYS(3095, pdisp) == SYS(3095, this))
PostMessageA(thisform.hWnd, 0x401, 0, 0)
ENDIF
ENDPROC
FUNCTION myMessage(hWnd, uMsg, wParam, lParam)
dom = this.web.document
IF VARTYPE(dom) != "O"
this.Enabled = .t.
RETURN
ENDIF
* 添加延迟,确保页面完全加载
INKEY(1.5)
IF EMPTY(tu.addr)
* 获取章节列表
lis = dom.getElementsByTagName("li")
LOCAL nCount, chapterCount
nCount = 0
chapterCount = 0
FOR EACH li IN lis
IF li.classname == "line3"
nCount = nCount + 1
* 只获取第7章到第11章
IF nCount >= this.startChapter AND nCount <= this.endChapter
chapterCount = chapterCount + 1
INSERT INTO tu (title, addr, txt) VALUES (;
li.getElementsByTagName("a").item[0].innertext,;
li.getElementsByTagName("a").item[0].href, "";
)
? "添加章节: " + li.getElementsByTagName("a").item[0].innertext
ENDIF
* 如果已经超过结束章节,退出循环
IF nCount > this.endChapter
EXIT
ENDIF
ENDIF
ENDFOR
IF RECCOUNT("tu") = 0
MESSAGEBOX("未找到第" + ALLTRIM(STR(this.startChapter)) + ;
"章至第" + ALLTRIM(STR(this.endChapter)) + "章的内容!")
this.Enabled = .t.
RETURN
ENDIF
GO TOP IN "tu"
this.grd.setfocus
this.currentChapterContent = ""
this.currentPageNumber = 1
this.isProcessingChapter = .T.
this.originalChapterUrl = ALLTRIM(tu.addr)
this.currentChapterBase = this.GetChapterBase(this.originalChapterUrl)
this.currentChapterTitle = ALLTRIM(tu.title)
this.processedUrls = this.originalChapterUrl + "|"
this.web.navigate(this.originalChapterUrl)
RETURN
ENDIF
* 爬取当前章节内容
IF !EOF("tu") AND this.isProcessingChapter
LOCAL lcContent, lcNextPage, lcCurrentUrl
lcContent = ""
lcNextPage = ""
lcCurrentUrl = this.web.LocationURL
* 检查当前URL是否属于当前章节
IF this.currentChapterBase != this.GetChapterBase(lcCurrentUrl)
? "警告: 检测到URL跳转到其他章节,停止当前章节爬取"
* 直接保存当前章节内容并移动到下一章
REPLACE tu.txt WITH this.currentChapterContent
? "章节 '" + this.currentChapterTitle + "' 获取完成,共 " + ;
ALLTRIM(STR(this.currentPageNumber)) + " 页"
* 重置分页状态
this.currentChapterContent = ""
this.currentPageNumber = 1
this.processedUrls = ""
* 移动到下一章
SKIP IN "tu"
IF !EOF("tu")
this.grd.setfocus
this.originalChapterUrl = ALLTRIM(tu.addr)
this.currentChapterBase = this.GetChapterBase(this.originalChapterUrl)
this.currentChapterTitle = ALLTRIM(tu.title)
this.processedUrls = this.originalChapterUrl + "|"
this.web.navigate(this.originalChapterUrl)
RETURN
ELSE
this.isProcessingChapter = .F.
GO TOP IN "tu"
this.grd.setfocus
this.Enabled = .t.
MESSAGEBOX("第" + ALLTRIM(STR(this.startChapter)) + "章至第" + ;
ALLTRIM(STR(this.endChapter)) + "章内容爬取完成!")
ENDIF
RETURN
ENDIF
* 根据HTML结构,从id="chapter"的div中获取内容
lcContent = this.ExtractChapterContent(dom)
IF EMPTY(lcContent)
? "警告: 无法提取章节内容,尝试备用方法..."
lcContent = this.ExtractContentAlternative(dom)
ENDIF
* 清理内容
lcContent = this.CleanContent(lcContent)
* 将当前页内容追加到章节总内容中
IF this.currentPageNumber == 1
this.currentChapterContent = this.currentChapterTitle + CHR(13) + CHR(10) + ;
CHR(13) + CHR(10) + lcContent
ELSE
this.currentChapterContent = this.currentChapterContent + CHR(13) + ;
CHR(10) + lcContent
ENDIF
* 查找下一页链接
lcNextPage = this.FindNextPageLink(dom, lcCurrentUrl)
* 如果有下一页且未超过最大页数限制,继续获取下一页
IF !EMPTY(lcNextPage) AND this.currentPageNumber < this.maxPages
this.currentPageNumber = this.currentPageNumber + 1
this.processedUrls = this.processedUrls + lcNextPage + "|"
this.web.navigate(lcNextPage)
? "章节 '" + this.currentChapterTitle + "' 第 " + ;
ALLTRIM(STR(this.currentPageNumber-1)) + " 页获取完成,继续获取下一页..."
RETURN
ELSE
* 没有下一页或达到最大页数,保存完整章节内容
REPLACE tu.txt WITH this.currentChapterContent
? "章节 '" + this.currentChapterTitle + "' 获取完成,共 " + ;
ALLTRIM(STR(this.currentPageNumber)) + " 页"
* 重置分页状态
this.currentChapterContent = ""
this.currentPageNumber = 1
this.processedUrls = ""
* 移动到下一章
SKIP IN "tu"
IF !EOF("tu")
this.grd.setfocus
this.originalChapterUrl = ALLTRIM(tu.addr)
this.currentChapterBase = this.GetChapterBase(this.originalChapterUrl)
this.currentChapterTitle = ALLTRIM(tu.title)
this.processedUrls = this.originalChapterUrl + "|"
this.web.navigate(this.originalChapterUrl)
RETURN
ELSE
this.isProcessingChapter = .F.
GO TOP IN "tu"
this.grd.setfocus
this.Enabled = .t.
MESSAGEBOX("第" + ALLTRIM(STR(this.startChapter)) + "章至第" + ;
ALLTRIM(STR(this.endChapter)) + "章内容爬取完成!")
ENDIF
ENDIF
ENDIF
ENDFUNC
* 获取章节基础URL(去除页码部分)
FUNCTION GetChapterBase(url)
LOCAL lcBase
lcBase = LOWER(ALLTRIM(url))
* 移除页码部分,获取章节基础URL
* 处理模式:/6780946/3664234966.html 和 /6780946/3664234966_2.html
IF AT("_", lcBase) > 0 AND AT(".html", lcBase) > 0
* 找到最后一个下划线位置
LOCAL lnPos
lnPos = RAT("_", lcBase)
IF lnPos > 0
lcBase = LEFT(lcBase, lnPos-1) + ".html"
ENDIF
ENDIF
RETURN lcBase
ENDFUNC
* 内容提取函数 - 根据HTML结构修正
FUNCTION ExtractChapterContent(dom)
LOCAL lcContent
lcContent = ""
* 3种方法任选一种
* 方法1: 从id="chapter"的div中获取内容
TRY
lcContent = dom.getElementById("chapter").innertext
IF !EMPTY(lcContent)
? "通过ID 'chapter' 获取内容成功"
RETURN lcContent
ENDIF
CATCH
ENDTRY
* 方法2: 如果上面失败,尝试从id="content"的div中获取
TRY
lcContent = dom.getElementById("content").innertext
IF !EMPTY(lcContent)
? "通过ID 'content' 获取内容成功"
RETURN lcContent
ENDIF
CATCH
ENDTRY
* 方法3: 通过类名获取
TRY
divs = dom.getElementsByTagName("div")
FOR i = 0 TO divs.length - 1
TRY
IF divs.item(i).className == "chapter" OR ;
AT("chapter", LOWER(divs.item(i).className)) > 0
lcContent = divs.item(i).innertext
IF !EMPTY(lcContent)
? "通过类名 '" + divs.item(i).className + "' 获取内容成功"
RETURN lcContent
ENDIF
ENDIF
CATCH
ENDTRY
ENDFOR
CATCH
ENDTRY
RETURN lcContent
ENDFUNC
* 备用内容提取方法
FUNCTION ExtractContentAlternative(dom)
LOCAL lcContent, i, elements, lcElementText, lcBestContent
LOCAL lnMaxLength, lnCurrentLength
lcContent = ""
lcBestContent = ""
lnMaxLength = 0
* 通过文本分析找到最可能是小说内容的元素
TRY
elements = dom.getElementsByTagName("div")
FOR i = 0 TO elements.length - 1
TRY
lcElementText = elements.item(i).innertext
lnCurrentLength = LEN(ALLTRIM(lcElementText))
* 如果这个元素的文本长度超过当前最大值,并且看起来像是正文
IF lnCurrentLength > lnMaxLength AND ;
lnCurrentLength > 100 AND ;
AT("上一章", lcElementText) = 0 AND ;
AT("下一章", lcElementText) = 0 AND ;
AT("目录", lcElementText) = 0 AND ;
AT("首页", lcElementText) = 0 AND ;
AT("友情链接", lcElementText) = 0
lnMaxLength = lnCurrentLength
lcBestContent = lcElementText
ENDIF
CATCH
ENDTRY
ENDFOR
IF !EMPTY(lcBestContent)
? "通过文本分析获取内容成功,长度: " + ALLTRIM(STR(lnMaxLength))
RETURN lcBestContent
ENDIF
CATCH
ENDTRY
RETURN lcContent
ENDFUNC
* 查找下一页链接 - 改进版本,确保只找到当前章节的下一页
FUNCTION FindNextPageLink(dom, currentUrl)
LOCAL lcNextPage, i, links, lcHref, lcText
lcNextPage = ""
* 方法1: 查找包含"下一章"文本的链接,但需要验证是否属于当前章节
TRY
links = dom.getElementsByTagName("a")
FOR i = 0 TO links.length - 1
TRY
lcHref = links.item(i).href
lcText = links.item(i).innertext
IF AT("下一章", lcText) > 0 OR AT("下一页", lcText) > 0
* 确保不是重复的URL且属于当前章节
IF AT(lcHref, this.processedUrls) = 0 AND ;
this.GetChapterBase(lcHref) == this.currentChapterBase
lcNextPage = lcHref
? "找到当前章节下一页链接: " + lcNextPage
RETURN lcNextPage
ENDIF
ENDIF
CATCH
ENDTRY
ENDFOR
CATCH
ENDTRY
* 方法2: 自动构造下一页URL
lcNextPage = this.ConstructNextPageUrl(currentUrl)
IF !EMPTY(lcNextPage) AND AT(lcNextPage, this.processedUrls) = 0 AND ;
this.GetChapterBase(lcNextPage) == this.currentChapterBase
? "通过构造找到下一页链接: " + lcNextPage
RETURN lcNextPage
ENDIF
RETURN lcNextPage
ENDFUNC
* 构造下一页URL
FUNCTION ConstructNextPageUrl(currentUrl)
LOCAL lcNextUrl
lcNextUrl = ""
* 根据当前URL模式构造下一页
IF AT("_", currentUrl) > 0 AND AT(".html", currentUrl) > 0
* 处理类似 /6780946/3664234966_1.html, /6780946/3664234966_2.html 的模式
LOCAL lnPos, lcBase, lcPageNum
lnPos = RAT("_", currentUrl)
IF lnPos > 0
lcBase = LEFT(currentUrl, lnPos)
lcPageNum = VAL(SUBSTR(currentUrl, lnPos+1))
IF lcPageNum > 0
lcNextUrl = lcBase + ALLTRIM(STR(lcPageNum + 1)) + ".html"
ENDIF
ENDIF
ELSE
* 如果当前是第一页,构造第二页
IF AT(".html", currentUrl) > 0
lcNextUrl = STRTRAN(currentUrl, ".html", "_2.html")
ENDIF
ENDIF
RETURN lcNextUrl
ENDFUNC
* 内容清理函数
FUNCTION CleanContent(lcContent)
LOCAL lcCleaned
lcCleaned = lcContent
* 移除作者提示和广告文本
lcCleaned = STRTRAN(lcCleaned, "【作者剑啸龙翔提示:如果章节内容错乱的话,关掉阅读模式即可正常】", "")
lcCleaned = STRTRAN(lcCleaned, "UU看书", "")
lcCleaned = STRTRAN(lcCleaned, "www., "")
lcCleaned = STRTRAN(lcCleaned, "请记住本书首发域名", "")
lcCleaned = STRTRAN(lcCleaned, "【首发.请记住网址(WWW.)】", "")
lcCleaned = STRTRAN(lcCleaned, "(WWW.)", "")
lcCleaned = STRTRAN(lcCleaned, "上一章 加入书签 下一章 ","")
lcCleaned = STRTRAN(lcCleaned, "[","")
lcCleaned = STRTRAN(lcCleaned, "]","")
* 清理HTML实体和多余空格
lcCleaned = STRTRAN(lcCleaned, " ", " ")
lcCleaned = STRTRAN(lcCleaned, CHR(13) + CHR(10) + CHR(13) + CHR(10), CHR(13) + CHR(10))
lcCleaned = STRTRAN(lcCleaned, CHR(10) + CHR(10), CHR(10))
* 整理文本格式
lcCleaned = STRTRAN(lcCleaned, " ", " ") && 减少空格数量
lcCleaned = ALLTRIM(lcCleaned)
RETURN lcCleaned
ENDFUNC
ENDDEFINE






