注册 登录
编程论坛 VFP论坛

求助吹版:下列程序为什么不能下载各章节小说内容

王咸美 发布于 6 天前 11:12, 223 次点击
下列程序为什么不能下载各章节小说内容,仅能爬取各章节网址?请吹版示教,万分感谢!!!
只有本站会员才能查看附件,请 登录

只有本站会员才能查看附件,请 登录

具体代码如下(待修改)

* Entry point: make the program's own folder the default directory, create the
* chapter table on first run, open it under the alias "tu" and show the crawler
* form modally. Everything is closed and released once the form is dismissed.
SET DEFAULT TO (ADDBS(JUSTPATH(SYS(16))))
* PostMessageA is called by the form to queue a message to its own window.
DECLARE long PostMessageA IN user32 long,long,long,long
IF !FILE("仙道九绝.dbf")
    * title = chapter title, addr = chapter URL, txt = downloaded chapter text
    CREATE TABLE 仙道九绝 (title C(100), addr C(254), txt M)
    USE
ENDIF
* The form's start button issues ZAP, which requires exclusive use of the
* table, so ask for it explicitly instead of relying on SET EXCLUSIVE.
USE 仙道九绝 ALIAS tu EXCLUSIVE
of = CREATEOBJECT("form1")
of.show(1)    && 1 = modal: execution continues here after the form closes
CLOSE TABLES ALL
CLEAR ALL
RETURN

* form1 - crawler window.
* Holds a start button, a grid bound to the alias "tu", an edit box that shows
* the stored text of the selected chapter, and an off-screen WebBrowser control.
* The browser first loads the index page (chapter links are stored in "tu"),
* then loads every chapter address in turn and stores its text in tu.txt.
DEFINE CLASS form1 as Form
    width = 800
    height = 600
    AutoCenter = .T.
    AllowOutput = .f.
    ADD OBJECT but as commandbutton WITH left=10,top=10,width=100,height=22,caption="开始"
    ADD OBJECT grd as grid WITH left=10,top=40,width=250,height=550,RecordSource="tu",AllowCellSelection=.f.
    ADD OBJECT edt as editbox WITH left=280,top=40,width=510,height=550
    ADD OBJECT web as Olecontrol with OleClass="Shell.Explorer.2",left=-100

    * Remove the window-message binding made in Init.
    PROCEDURE Destroy
        UNBINDEVENTS(this.hWnd)
    ENDPROC

    * Silence browser dialogs and route window message 0x401 (WM_USER + 1)
    * sent to this form to myMessage.
    PROCEDURE Init
        this.web.Silent = .t.
        BINDEVENT(this.hWnd, 0x401, this, "myMessage")
    ENDPROC

    * Start button: empty the chapter table and load the index page.
    PROCEDURE but.click
        ZAP IN "tu"
        thisform.edt.value = " "
        * NOTE(review): the host part of this address is missing in the listing
        * (the literal was also left unterminated); put the real index-page URL
        * here before running.
        thisform.web.navigate("http://www.")
    ENDPROC

    * Grid click: show the stored text of the current chapter from the top.
    PROCEDURE grd.click
        thisform.edt.value = tu.txt
        thisform.edt.SelStart = 0
    ENDPROC

    * Browser event: when the object that finished loading is the browser
    * control itself (same IDispatch pointer), queue message 0x401 to the form
    * so the page is processed in myMessage rather than inside this handler.
    PROCEDURE web.documentComplete(pdisp, url)
        IF (SYS(3095, pdisp) == SYS(3095, this))
            PostMessageA(thisform.hWnd, 0x401, 0, 0)
        ENDIF
    ENDPROC

    * Handler for message 0x401: processes the page currently loaded in the
    * browser. While "tu" has no address at the current record the page is
    * treated as the index page; otherwise it is the chapter page of the
    * current record. The four parameters are the window-message arguments
    * supplied by BINDEVENT and are not used.
    FUNCTION myMessage(hWnd, uMsg, wParam, lParam)
        * Keep the work variables local so they do not leak as privates.
        LOCAL dom, lis, li, oLinks, oBody, cText
        dom = this.web.document
        IF VARTYPE(dom) != "O"
            this.Enabled = .t.
            RETURN
        ENDIF
        IF EMPTY(tu.addr)
            * Index page: every <li class="line3"> holds one chapter link.
            * This tag scan is the variant for older IE versions; on newer IE
            * versions dom.getElementsByClassName("line3") can be tried instead.
            lis = dom.getElementsByTagName("li")
            FOR EACH li IN lis
                IF li.classname == "line3"
                    oLinks = li.getElementsByTagName("a")
                    * Skip list items that carry no link at all.
                    IF oLinks.length > 0
                        INSERT INTO tu (title, addr, txt) ;
                            VALUES (oLinks.item[0].innertext, oLinks.item[0].href, "")
                    ENDIF
                ENDIF
            ENDFOR
            IF RECCOUNT("tu") = 0
                * No chapter links found: stop instead of navigating to an
                * empty address.
                this.Enabled = .t.
                RETURN
            ENDIF
            GO TOP IN "tu"
            this.grd.setfocus
            this.web.navigate(ALLTRIM(tu.addr))
            RETURN
        ENDIF
        * Chapter page: the text is read from the element with id "ad".
        oBody = dom.getElementById("ad")
        IF VARTYPE(oBody) == "O"
            cText = NVL(oBody.innertext, "")
        ELSE
            * Element missing on this page: store the title only and go on.
            cText = ""
        ENDIF
        * IN "tu" makes the write independent of the selected work area.
        REPLACE txt WITH ALLTRIM(tu.title) + 0h0D0A0D0A + cText IN "tu"
        ? ALLTRIM(tu.title)
        SKIP IN "tu"
        IF !EOF("tu")
            * More chapters left: load the next one.
            this.grd.setfocus
            this.web.navigate(ALLTRIM(tu.addr))
            RETURN
        ENDIF
        * All chapters processed: go back to the first record.
        GO TOP IN "tu"
        this.grd.setfocus
        this.Enabled = .t.
    ENDFUNC
ENDDEFINE

10 回复
#2
王咸美6 天前 12:26
只有本站会员才能查看附件,请 登录
#3
吹水佬6 天前 13:50
我的代码测试环境:windows 10 专业版  VFP9。
再次强调:没有通式,不能照抄。
你的贴好像也回复过几种不同的方式方法。
如你真想玩转,必须学点HTML、JS。
想玩得神学点HTTP协议。
想玩得癫学点TCP/IP协议。


#4
yiyanxiyin6 天前 17:06
xp下可以使用MSXML2.XMLHTTP.3.0组件,这个组件和IE没关系,即使IE不能正常显示的网页也没问题。通过这个组件将网页内容下载下来,然后作为文本分析,使用正则提取小说内容。
#5
sam_jiang6 天前 21:47
为什么我用ohtml=createobject("htmlfile")会报错,提示找不到htmlfile类定义???
#6
吹水佬6 天前 21:59
回复 楼主 王咸美
看了一下网页关键标签和之前的一样,看你贴的图也完成两章,有时卡可能是网络连接慢,等等看。
只有本站会员才能查看附件,请 登录

改了一下按每章内容保存到文件,这样会安全点。
程序代码:

* Entry point: prepare a per-book folder next to the program, create/open the
* chapter index table in it under the alias "tu" and show the crawler form
* modally. Chapter texts are written by the form as one .txt file per chapter.
CLEAR
CLEAR ALL
CLOSE TABLES ALL
SET TALK OFF
SET SAFETY OFF
* PostMessageA is called by the form to queue a message to its own window.
DECLARE long PostMessageA IN user32 long,long,long,long
cDefPath = ADDBS(JUSTPATH(SYS(16)))
SET DEFAULT TO (cDefPath)
** Page to crawl. Alternative book:
**    web_url   = "http://www./105790648"
**    file_name = "吃喝玩乐之重生1997"
* web_url and file_dir are read by the form's methods while it runs, so they
* must stay ordinary (private) variables of this program - do not make them LOCAL.
web_url   = "http://www./6780946"
file_name = "仙道九绝"
file_dir  = cDefPath + file_name + "\"
IF !DIRECTORY(file_dir)
    MD (file_dir)
ENDIF
* From here on file_name is the full path of the chapter index table.
file_name = file_dir + file_name + ".dbf"
IF !FILE(file_name)
    * title = chapter title, addr = chapter URL
    CREATE TABLE (file_name) (title C(100), addr C(254))
    USE
ENDIF
* The form's start button issues ZAP, which requires exclusive use of the
* table, so ask for it explicitly instead of relying on SET EXCLUSIVE.
USE (file_name) ALIAS tu EXCLUSIVE
of = CREATEOBJECT("form1")
of.show(1)    && 1 = modal: execution continues here after the form closes
CLOSE TABLES ALL
CLEAR ALL
RETURN

* form1 - crawler window.
* Holds a start button, a grid bound to the alias "tu", an edit box that shows
* the saved text file of the selected chapter, and an off-screen WebBrowser
* control. The browser first loads web_url (chapter links are stored in "tu"),
* then loads every chapter address in turn and saves its text as
* <file_dir><title>.txt. web_url and file_dir are variables set by the
* calling program.
DEFINE CLASS form1 as Form
    width = 800
    height = 600
    AutoCenter = .T.
    AllowOutput = .f.
    ADD OBJECT but as commandbutton WITH left=10,top=10,width=100,height=22,caption="开始"
    ADD OBJECT grd as grid WITH left=10,top=40,width=250,height=550,RecordSource="tu",AllowCellSelection=.f.
    ADD OBJECT edt as editbox WITH left=280,top=40,width=510,height=550
    ADD OBJECT web as Olecontrol with OleClass="Shell.Explorer.2",left=-1000

    * Remove the window-message binding made in Init.
    PROCEDURE Destroy
        UNBINDEVENTS(this.hWnd)
    ENDPROC

    * Silence browser dialogs and route window message 0x401 (WM_USER + 1)
    * sent to this form to myMessage.
    PROCEDURE Init
        this.web.Silent = .t.
        BINDEVENT(this.hWnd, 0x401, this, "myMessage")
    ENDPROC

    * Start button: empty the chapter table and load the index page.
    PROCEDURE but.click
        ZAP IN "tu"
        thisform.edt.value = ""
        thisform.web.navigate(web_url)
    ENDPROC

    * Grid click: show the saved text of the current chapter from the top.
    * A chapter that has not been saved yet shows as empty instead of raising
    * a "file does not exist" error.
    PROCEDURE grd.click
        LOCAL fn
        fn = thisform.ChapterFile()
        IF FILE(fn)
            thisform.edt.value = FILETOSTR(fn)
        ELSE
            thisform.edt.value = ""
        ENDIF
        thisform.edt.SelStart = 0
    ENDPROC

    * Browser event: when the object that finished loading is the browser
    * control itself (same IDispatch pointer), queue message 0x401 to the form
    * so the page is processed in myMessage rather than inside this handler.
    PROCEDURE web.documentComplete(pdisp, url)
        IF (SYS(3095, pdisp) == SYS(3095, this))
            PostMessageA(thisform.hWnd, 0x401, 0, 0)
        ENDIF
    ENDPROC

    * Returns the full path of the text file for the chapter at the current
    * "tu" record. Characters that are not allowed in Windows file names are
    * replaced by "_"; CHRTRANC is used so double-byte characters stay intact.
    FUNCTION ChapterFile
        RETURN file_dir + CHRTRANC(ALLTRIM(tu.title), '\/:*?"<>|', "_________") + ".txt"
    ENDFUNC

    * Handler for message 0x401: processes the page currently loaded in the
    * browser. While "tu" has no address at the current record the page is
    * treated as the index page; otherwise it is the chapter page of the
    * current record. The four parameters are the window-message arguments
    * supplied by BINDEVENT and are not used.
    FUNCTION myMessage(hWnd, uMsg, wParam, lParam)
        * Keep the work variables local so they do not leak as privates.
        LOCAL dom, lis, li, oLinks, oBody, cText
        dom = this.web.document
        IF VARTYPE(dom) != "O"
            this.Enabled = .t.
            RETURN
        ENDIF
        IF EMPTY(tu.addr)
            * Index page: every element of class "line3" holds one chapter link.
            #if 1   && 0 or 1
            * Variant to try with newer IE versions.
            lis = dom.getElementsByClassName("line3")
            FOR EACH li IN lis
                INSERT INTO tu (title, addr) ;
                    VALUES (li.firstChild.innertext, li.firstChild.href)
            ENDFOR
            #else
            * Variant to try with older IE versions.
            lis = dom.getElementsByTagName("li")
            FOR EACH li IN lis
                IF li.classname == "line3"
                    oLinks = li.getElementsByTagName("a")
                    * Skip list items that carry no link at all.
                    IF oLinks.length > 0
                        INSERT INTO tu (title, addr) ;
                            VALUES (oLinks.item[0].innertext, oLinks.item[0].href)
                    ENDIF
                ENDIF
            ENDFOR
            #endif
            IF RECCOUNT("tu") = 0
                * No chapter links found: stop instead of navigating to an
                * empty address.
                this.Enabled = .t.
                RETURN
            ENDIF
            GO TOP IN "tu"
            this.grd.setfocus
            this.web.navigate(ALLTRIM(tu.addr))
            RETURN
        ENDIF
        * Chapter page: the text is read from the element with id "ad".
        oBody = dom.getElementById("ad")
        IF VARTYPE(oBody) == "O"
            cText = NVL(oBody.innertext, "")
        ELSE
            * Element missing on this page: save the title only and go on.
            cText = ""
        ENDIF
        STRTOFILE(ALLTRIM(tu.title) + 0h0D0A0D0A + cText, this.ChapterFile())
        ? ALLTRIM(tu.title)
        SKIP IN "tu"
        IF !EOF("tu")
            * More chapters left: load the next one.
            this.grd.setfocus
            this.web.navigate(ALLTRIM(tu.addr))
            RETURN
        ENDIF
        * All chapters processed: go back to the first record.
        GO TOP IN "tu"
        this.grd.setfocus
        this.Enabled = .t.
    ENDFUNC
ENDDEFINE
#7
吹水佬5 天前 00:08
以下是引用sam_jiang在2025-11-22 21:47:22的发言:

为什么我用ohtml=createobject("htmlfile")会报错,提示找不到htmlfile类定义???

ie能用吗?
#8
王咸美5 天前 03:20
@吹水佬:谢谢!!!
#9
sam_jiang5 天前 11:59
回复 7楼 吹水佬
能用,电脑上有mshtml.dll组件呀
#10
吹水佬5 天前 14:17
以下是引用sam_jiang在2025-11-23 11:59:50的发言:

能用,电脑上有mshtml.dll组件呀

ieframe.dll


#11
王咸美5 天前 18:45
谢谢!但对windows xp系统,低版本IE浏览器真的无法下载,况且有的章节有三页或两页的情况。

[此贴子已经被作者于2025-11-23 19:26编辑过]

1