如何用VFP将《仙道九绝》小说爬下来?
如何用VFP将《仙道九绝》小说爬下来?网址:https://www.
要求:1. 将小说各章节完整爬下来;
2. 将小说正文完整爬下来。
请高手赐教,万分感谢!!!(主要是学习代码的编写)
程序代码:
* Main script: download one chapter, show it in an editor window, then clean up.
* Win32 APIs called by getHtml(): download a URL to a local file / remove a URL from the cache
DECLARE long URLDownloadToFileA IN urlmon long,string,string,long,long
DECLARE long DeleteUrlCacheEntry IN wininet string
* HTML document object used as the parser; getContent() reads this variable by name
dom = CREATEOBJECT("htmlfile")
* Fetch the content (this example fetches only the first chapter; other chapters work the same way)
content = getContent("https://www./10070880/10116202.html")
* Show the content
tmpFile = "C:\_temp\content.txt"
* The output folder is hard-coded and may not exist yet; create it so STRTOFILE() can write there
IF !DIRECTORY(JUSTPATH(tmpFile))
    MD (JUSTPATH(tmpFile))
ENDIF
STRTOFILE(content, tmpFile)
* No NOWAIT clause: the file is deleted only after the editor window has been closed
MODIFY FILE (tmpFile)
DELETE FILE (tmpFile)
CLEAR ALL
RETURN
*--------------------------------------------------------------------
* getContent(url [, pageCount])
* Downloads one chapter and returns its text: the chapter title (the
* first <h1> whose class is "title") followed by the text of the element
* with id "content" from every page. Each part is followed by a blank line.
*
* url       - address of the first page of the chapter.
* pageCount - optional number of pages to fetch; defaults to 2.
*             Page N (N >= 2) is requested as "<stem>_N.html" in the same
*             folder as url.
*             NOTE(review): only the "_2" form is known to exist; confirm
*             that later pages follow the same naming before passing > 2.
* Returns   - the collected text. Pages that fail to download, or that
*             have no "content" element, are skipped instead of raising
*             an error.
* Requires  - the variable "dom" (an "htmlfile" object) created by the
*             caller, and the function getHtml().
*--------------------------------------------------------------------
FUNCTION getContent(url, pageCount)
    LOCAL cHtml, titles, title, content, node, pageNo, pageUrl, basePath, stem
    * An omitted parameter arrives as .F.; fall back to the two pages per chapter
    IF !(VARTYPE(pageCount) == "N")
        pageCount = 2
    ENDIF
    pageCount = MAX(1, INT(pageCount))

    content = ""
    basePath = JUSTPATH(url)
    stem = JUSTSTEM(url)

    FOR pageNo = 1 TO pageCount
        * Page 1 is the chapter URL itself; later pages carry a "_N" suffix
        IF pageNo = 1
            pageUrl = url
        ELSE
            pageUrl = basePath + "/" + stem + "_" + TRANSFORM(pageNo) + ".html"
        ENDIF

        * Fetch the page; STRCONV(..., 11) converts UTF-8 to double-byte characters
        cHtml = STRCONV(getHtml(pageUrl), 11)
        IF EMPTY(cHtml)
            * Download failed (getHtml already reported it); nothing to parse
            LOOP
        ENDIF

        * Load the page into the parser
        dom.write(cHtml)

        * Take the title from the first page only
        IF pageNo = 1
            titles = dom.getElementsByTagName("h1")
            FOR EACH title IN titles
                IF title.classname == "title"
                    content = title.innertext + 0h0D0A0D0A
                    EXIT
                ENDIF
            ENDFOR
        ENDIF

        * Take the body text; the element may be missing on an unexpected page
        node = dom.getElementById("content")
        IF VARTYPE(node) == "O"
            content = content + node.innertext + 0h0D0A0D0A
        ENDIF
        dom.close
    ENDFOR

    RETURN content
ENDFUNC
*--------------------------------------------------------------------
* getHtml(url)
* Downloads url into a temporary file with URLDownloadToFileA and returns
* the file's contents unconverted (callers do any character conversion).
*
* url      - address to download.
* Returns  - the downloaded data as a string, or "" when the download
*            fails (a message is also printed with "?").
* Requires - URLDownloadToFileA and DeleteUrlCacheEntry to have been
*            DECLAREd by the caller.
*--------------------------------------------------------------------
FUNCTION getHtml(url)
    LOCAL tmpHtml, ret
    tmpHtml = "C:\_temp\tmp.html" && temporary file
    * The temp folder is hard-coded and may not exist yet; create it so the
    * download has somewhere to write
    IF !DIRECTORY(JUSTPATH(tmpHtml))
        MD (JUSTPATH(tmpHtml))
    ENDIF
    * Remove any cached copy of this URL before downloading
    DeleteUrlCacheEntry(url)
    * 0 is the success code; also make sure the file really exists before reading it
    IF URLDownloadToFileA(0, url, tmpHtml, 0, 0) == 0 AND FILE(tmpHtml)
        ret = FILETOSTR(tmpHtml)
        DELETE FILE (tmpHtml)
        RETURN ret
    ENDIF
    ? "下载 " + url + " 失败"
    RETURN ""
ENDFUNC