请教网页数据抓取问题
请教各位大侠一个问题:如何才能把百度百科中的词条解释页面最下面的“相关词条”中的词条抓取到自己的网站页面上,包括词条和链接,在自己网站页面上点击词条之后就能链接到百度百科的词条解释页面。例如:http://baike.baidu.com/view/598725.htm
程序代码:
package org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Small demo that prints the links found in one section of an HTML page,
 * using the HTMLParser library.
 *
 * @author rrong_m
 */
public class TestHtmlparser {

    /**
     * Prints every link located inside the element(s) whose {@code id}
     * attribute equals {@code "word_more_con"}, one per line in the form
     * {@code linkText:linkUrl}.
     *
     * @param url location of the page handed to {@link Parser}
     * @throws ParserException if the parser cannot be set up for {@code url}
     *                         or the page cannot be parsed
     */
    public static void getWords(String url) throws ParserException {
        Parser parser = new Parser(url);

        // The filters are passed inline on purpose: the NodeFilter interface is
        // not among the imports visible in this file, so declaring locals of
        // that type leaves an unresolved symbol and the class fails to compile.
        NodeList container =
                parser.extractAllNodesThatMatch(new HasAttributeFilter("id", "word_more_con"));

        // recursive = true: search the matched nodes' descendants as well,
        // so anchors nested at any depth are collected.
        NodeList links =
                container.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class), true);

        for (int i = 0; i < links.size(); i++) {
            // The cast is safe: the class filter above only lets LinkTag nodes through.
            LinkTag link = (LinkTag) links.elementAt(i);
            System.out.println(link.getLinkText() + ":" + link.getLink());
        }
    }

    /**
     * Runs {@link #getWords(String)} against a sample Baidu Baike entry page.
     *
     * @param args command-line arguments (unused)
     * @throws ParserException propagated from {@link #getWords(String)}
     */
    public static void main(String[] args) throws ParserException {
        getWords("http://baike.baidu.com/view/598725.htm");
    }
}
