回复 39楼 zaixuexi
谢谢Z版赐教,继续学习你改进的代码。

梅尚程荀
马谭杨奚
程序代码:
#include <stdio.h>
#include <winsock.h>
#include <string.h>
#include <afxinet.h>
#include <algorithm>
#include <afxdhtml.h>
#include <vector>
#pragma comment(lib, "ws2_32.lib")
/**
 * Download a page over plain HTTP (TCP port 80) and return the raw response.
 *
 * @param url  Host name optionally followed by a path, without a scheme
 *             prefix. Everything before the first '/' is used as the host;
 *             the remainder is the request path ("/" when there is none).
 * @return     Everything received from the server (status line and headers
 *             included), appended chunk by chunk. Empty when the URL is
 *             NULL, empty, has no host part, is BUFSIZ characters or longer,
 *             or when nothing could be received.
 *
 * Each received chunk is appended as a C string, so data following an
 * embedded NUL byte inside a chunk is dropped.
 */
CString geturl(char *url)
{
    CString strHtml;

    // Reject input that cannot be split safely into the fixed-size buffers below.
    if (url == NULL)
        return strHtml;
    const size_t urlLen = strlen(url);
    if (urlLen == 0 || urlLen >= BUFSIZ)
        return strHtml;

    // Winsock must be initialised with WSAStartup before any socket call.
    WSADATA WSAData = {0};
    if (WSAStartup(MAKEWORD(2,2), &WSAData))
    {
        printf("WSA failed\n");
        return strHtml;
    }

    // Split the URL into host and request path. Both pieces are no longer
    // than the URL itself, which was checked to fit in BUFSIZ above.
    char host[BUFSIZ];
    char GET[BUFSIZ];
    const char *pPath = strchr(url, '/');
    if (pPath == NULL)
    {
        strcpy(host, url);
        strcpy(GET, "/");
    }
    else
    {
        const size_t hostLen = static_cast<size_t>(pPath - url);
        memcpy(host, url, hostLen);
        host[hostLen] = '\0';
        strcpy(GET, pPath);
    }

    // Build the request. host + path is at most BUFSIZ characters and the
    // fixed text adds 44 more, so BUFSIZ + 64 cannot overflow.
    char header[BUFSIZ + 64] = "";
    strcat(header, "GET ");
    strcat(header, GET);
    strcat(header, " HTTP/1.1\r\n");
    strcat(header, "HOST: ");
    strcat(header, host);
    strcat(header, "\r\nConnection: Close\r\n\r\n");

    // gethostbyname returns NULL when the name cannot be resolved.
    struct hostent *pURL = (host[0] != '\0') ? gethostbyname(host) : NULL;
    if (pURL != NULL && pURL->h_addrtype == AF_INET &&
        pURL->h_addr_list != NULL && pURL->h_addr != NULL)
    {
        SOCKET sockfd = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
        if (sockfd != INVALID_SOCKET)
        {
            struct sockaddr_in addr;
            memset(&addr, 0, sizeof(addr));
            addr.sin_family = AF_INET;
            memcpy(&addr.sin_addr, pURL->h_addr, sizeof(addr.sin_addr));
            addr.sin_port = htons(80);

            // Connect, send the request, then read until the peer closes.
            const int headerLen = static_cast<int>(strlen(header));
            if (connect(sockfd, reinterpret_cast<SOCKADDR *>(&addr), sizeof(addr)) == 0 &&
                send(sockfd, header, headerLen, 0) == headerLen)
            {
                // One spare byte so every chunk can be NUL-terminated
                // before it is appended as a C string.
                char text[BUFSIZ + 1];
                int received;
                while ((received = recv(sockfd, text, BUFSIZ, 0)) > 0)
                {
                    text[received] = '\0';
                    strHtml += text;
                }
            }
            closesocket(sockfd);
        }
    }

    // Balance the successful WSAStartup on every path.
    WSACleanup();
    return strHtml;
}
// The one and only application object
CWinApp theApp;
using namespace std;
/**
 * Read an attribute of an HTML element as text.
 *
 * @param sp       Element to query.
 * @param strAttr  Attribute name.
 * @return         The attribute value converted to a string, or an empty
 *                 string when the element is null, the call fails, the
 *                 attribute is null/empty, or it cannot be converted to text.
 */
CString GetElementAttr(CComPtr<IHTMLElement> sp, CString strAttr)
{
    if (!sp)
        return CString("");

    // CComVariant initialises itself and clears its content (including a
    // returned BSTR) when it goes out of scope.
    CComVariant var;
    BSTR bsAttr = strAttr.AllocSysString();
    HRESULT hr = sp->getAttribute(bsAttr, 0, &var);
    ::SysFreeString(bsAttr);

    if (hr != S_OK || var.vt == VT_NULL || var.vt == VT_EMPTY)
        return CString("");

    // Attributes are not always strings; convert before reading bstrVal.
    if (var.vt != VT_BSTR && FAILED(var.ChangeType(VT_BSTR)))
        return CString("");

    return CString(var.bstrVal);
}
/**
 * Return the inner HTML of an element.
 *
 * @param sp  Element to query.
 * @return    The element's innerHTML, or an empty string when the element
 *            is null or the call fails.
 */
CString GetElementInnerHTML(CComPtr<IHTMLElement> sp)
{
    // CComBSTR starts out NULL and frees the returned string on scope exit.
    CComBSTR bsHtml;
    if (!sp || FAILED(sp->get_innerHTML(&bsHtml)))
        return CString();
    return CString(bsHtml.m_str);
}
/**
 * Return the inner text of an element.
 *
 * @param sp  Element to query.
 * @return    The element's innerText, or an empty string when the element
 *            is null or the call fails.
 */
CString GetElementInnerText(CComPtr<IHTMLElement> sp)
{
    // CComBSTR starts out NULL and frees the returned string on scope exit.
    CComBSTR bsText;
    if (!sp || FAILED(sp->get_innerText(&bsText)))
        return CString();
    return CString(bsText.m_str);
}
/**
 * Return the class name of an element.
 *
 * @param sp  Element to query.
 * @return    The element's className, or an empty string when the element
 *            is null or the call fails.
 */
CString GetElementClassName(CComPtr<IHTMLElement> sp)
{
    // CComBSTR starts out NULL and frees the returned string on scope exit.
    CComBSTR bsClass;
    if (!sp || FAILED(sp->get_className(&bsClass)))
        return CString();
    return CString(bsClass.m_str);
}
/**
 * Fetch the element at a given index of an element collection.
 *
 * @param elementCollection  Collection to read from.
 * @param nIndex             Index of the wanted item.
 * @param ppElem             Receives an IHTMLElement pointer on success; the
 *                           caller owns the reference. Set to NULL on failure.
 * @return                   TRUE when the item exists and exposes
 *                           IHTMLElement, FALSE otherwise.
 */
BOOL GetElementByCollection(CComPtr<IHTMLElementCollection> elementCollection, int nIndex, void** ppElem)
{
    if (ppElem == NULL)
        return FALSE;
    *ppElem = NULL;
    if (!elementCollection)
        return FALSE;

    // Smart pointer so the IDispatch reference handed out by item() is
    // released on every path.
    CComPtr<IDispatch> spDispItem;
    CComVariant vIndex = nIndex;
    HRESULT hr = elementCollection->item(vIndex, vIndex, &spDispItem);
    if (FAILED(hr) || !spDispItem)
        return FALSE;

    hr = spDispItem->QueryInterface(IID_IHTMLElement, ppElem);
    return hr == S_OK;
}
/**
 * Return the tag name of an element.
 *
 * @param sp  Element to query.
 * @return    The element's tagName, or an empty string when the element is
 *            null or the call fails.
 */
CString GetElementTagName(CComPtr<IHTMLElement> sp)
{
    // CComBSTR starts out NULL and frees the returned string on scope exit.
    CComBSTR bsTag;
    if (!sp || FAILED(sp->get_tagName(&bsTag)))
        return CString();
    return CString(bsTag.m_str);
}
/**
 * Return the id of an element.
 *
 * @param sp  Element to query.
 * @return    The element's id, or an empty string when the element is null
 *            or the call fails.
 */
CString GetElementId(CComPtr<IHTMLElement> sp)
{
    // CComBSTR starts out NULL and frees the returned string on scope exit.
    CComBSTR bsId;
    if (!sp || FAILED(sp->get_id(&bsId)))
        return CString();
    return CString(bsId.m_str);
}
/**
 * Download a page with geturl() and load the response into a new in-memory
 * HTML document.
 *
 * @param url   Address handed to geturl().
 * @param pDoc  Receives the new document. Any document it held before is
 *              released first. Left null when the document cannot be
 *              created or the text cannot be written into it.
 *
 * NOTE(review): CoInitialize is called on every invocation and never
 * balanced here, because the returned document must outlive this function.
 * The owner of the thread should pair it with CoUninitialize.
 */
void GetTopDocumentFromUrl(CString url,CComQIPtr<IHTMLDocument2>& pDoc)
{
    CoInitialize(NULL);

    CString strHtml = geturl(url.GetBuffer());
    url.ReleaseBuffer();

    // Drop whatever the caller passed in so the raw write below cannot leak it.
    pDoc.Release();
    HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,
                                  IID_IHTMLDocument2, (void**)&pDoc.p);
    if (FAILED(hr) || !pDoc)
    {
        pDoc.Release();
        return;
    }

    // IHTMLDocument2::write takes a SAFEARRAY of VARIANTs holding the text.
    SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
    if (psa == NULL)
    {
        pDoc.Release();
        return;
    }

    VARIANT* param = NULL;
    hr = SafeArrayAccessData(psa, (LPVOID*)&param);
    if (SUCCEEDED(hr))
    {
        // The array element takes ownership of the BSTR.
        param->vt = VT_BSTR;
        param->bstrVal = strHtml.AllocSysString();
        SafeArrayUnaccessData(psa);

        hr = pDoc->write(psa);
        // Close the output stream opened by write() so the document is complete.
        pDoc->close();
    }

    // Destroying the array clears its VARIANT element, which frees the BSTR.
    SafeArrayDestroy(psa);

    if (FAILED(hr))
        pDoc.Release();
}
// One scraped poem record, filled by GetShiCiFromUrl and written to out.xml.
struct ShiCi
{
CString strName;    // text of the 1st "HeightBorderCenter" element; saved as <Name>
CString strWriter;  // text of the 2nd "HeightBorderCenter" element; saved as <Writer>
CString strContext; // text of the 3rd "HeightBorderCenter" element; saved as <Context>
};
/**
 * Download a page and extract one poem record from it.
 *
 * The text of the first three elements whose class name is
 * "HeightBorderCenter" is stored, in document order, as name, writer and
 * content.
 *
 * @param strUrl  Page address, passed on to GetTopDocumentFromUrl.
 * @param sc      Receives the extracted fields. It may be partially filled
 *                when FALSE is returned.
 * @return        TRUE when at least three matching elements were found,
 *                FALSE otherwise (including any download or parsing failure).
 */
BOOL GetShiCiFromUrl(CString strUrl, ShiCi & sc)
{
    CComQIPtr<IHTMLDocument2> pDoc;
    GetTopDocumentFromUrl(strUrl, pDoc);
    if (!pDoc)
        return FALSE;

    // A failed or empty download can leave the document without a body.
    CComPtr<IHTMLElement> spTop;
    if (FAILED(pDoc->get_body(&spTop)) || !spTop)
        return FALSE;

    CComPtr<IDispatch> pDispatch;
    if (FAILED(spTop->get_all(&pDispatch)) || !pDispatch)
        return FALSE;

    // CComQIPtr performs the QueryInterface and stays null when it fails.
    CComQIPtr<IHTMLElementCollection> spCol(pDispatch);
    if (!spCol)
        return FALSE;

    long len = 0;
    if (FAILED(spCol->get_length(&len)))
        return FALSE;

    int num = 0;
    // Only the first three matches are used, so stop once they are found.
    for (long c = 0; c < len && num < 3; ++c)
    {
        CComPtr<IHTMLElement> spP1;
        if (GetElementByCollection(spCol, static_cast<int>(c), (void**)&spP1) &&
            GetElementClassName(spP1) == "HeightBorderCenter")
        {
            switch (num)
            {
            case 0: sc.strName    = GetElementInnerText(spP1); break;
            case 1: sc.strWriter  = GetElementInnerText(spP1); break;
            case 2: sc.strContext = GetElementInnerText(spP1); break;
            }
            ++num;
        }
    }

    return num >= 3 ? TRUE : FALSE;
}
int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
int nRetCode = 0;
// 初始化 MFC 并在失败时显示错误
if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
{
// TODO: 更改错误代码以符合您的需要
_tprintf(_T("错误: MFC 初始化失败\n"));
nRetCode = 1;
}
else
{
// TODO: 在此处为应用程序的行为编写代码。
}
std::vector<ShiCi> arrShiCi;
int nStart = 101;
int nEnd = 140;
for (int i=nStart; i<nEnd; i++)
{
ShiCi sc;
CString strUrl;
strUrl.Format("www.", i);
if (GetShiCiFromUrl(strUrl, sc))
arrShiCi.push_back(sc);
system("cls");
printf("共%d个,已经完成%d个,获取%d个诗词\n", nEnd-nStart, i-nStart+1, arrShiCi.size());
//////////////////////////////////////////////////////////////////////////
// 这么保存保险一点
FILE* fp = fopen("out.xml", "wt");
if (fp == NULL)
return 0;
fprintf(fp, "<?xml version=\"1.0\" encoding=\"gb2312\"?>\n");
fprintf(fp, "<root>\n");
for (int j=0; j<arrShiCi.size(); j++)
{
fprintf(fp, "<sc%d>\n", j);
fprintf(fp, "<Name>\n");
fprintf(fp, "%s\n", arrShiCi[j].strName);
fprintf(fp, "</Name>\n");
fprintf(fp, "<Writer>\n");
fprintf(fp, "%s\n", arrShiCi[j].strWriter);
fprintf(fp, "</Writer>\n");
fprintf(fp, "<Context>\n");
fprintf(fp, "%s\n", arrShiCi[j].strContext);
fprintf(fp, "</Context>\n");
fprintf(fp, "</sc%d>\n", j);
}
fprintf(fp, "</root>\n");
fclose(fp);
}
return nRetCode;
}

