http://luoye1989hzx.blog.163.com/blog/static/1699218892010828364066/
本文引用自luoye1989《htmlparser实现从网页上抓取数据(收集)》
package parser;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * Downloads a web page and saves the complete HTML content to a fixed file.
 * The URL is hard-coded and must be edited by hand.
 *
 * @author chenguoyong
 */
public class ScrubSelectedWeb {
    private final static String CRLF = System.getProperty("line.separator");

    public static void main(String[] args) {
        BufferedReader in = null;
        BufferedWriter out = null;
        try {
            URL ur = new URL("http://10.249.187.199:8083/injs100/");
            InputStream instr = ur.openStream();
            in = new BufferedReader(new InputStreamReader(instr));
            out = new BufferedWriter(new FileWriter("D:/outPut.txt"));
            // PERF FIX: append to a buffer instead of concatenating Strings.
            StringBuffer sb = new StringBuffer();
            String s;
            while ((s = in.readLine()) != null) {
                sb.append(s).append(CRLF);
            }
            System.out.println(sb);
            out.write(sb.toString());
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // BUG FIX: the original only closed the streams on success, leaking
            // the socket and the file handle whenever an exception was thrown.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // best-effort close
                }
            }
            if (out != null) {
                try {
                    out.close();
                } catch (IOException ignored) {
                    // best-effort close
                }
            }
        }
    }
}

基本能实现网页抓取,不过要手动输入URL,此外没有重构。只是一个简单的思路。

1.htmlparser 使用

htmlparser是一个纯的java写的html解析的库,htmlparser不依赖于其它的java库,htmlparser主要用于改造 或提取html。htmlparser能超高速解析html,而且不会出错。毫不夸张地说,htmlparser就是目前最好的html解 析和分析的工具。无论你是想抓取网页数据还是改造html的内容,用了htmlparser绝对会忍不住称赞。由于htmlparser 结构设计精良,所以扩展htmlparser 非常便利。

Htmlparser中文论坛. http://bbs.hexiao.cn/thread.php?fid=6

Constructor Summary
Parser()
Parser(URLConnection connection) Construct a parser using the provided URLConnection.
Method:
static Parser createParser(String html, String charset) Creates the parser on an input string.
void visitAllNodesWith(NodeVisitor visitor) Apply the given visitor to the current page.
HtmlPage(Parser parser)
NodeList getBody()
TableTag[] getTables()
String getTitle()
void setTitle(String title)
void visitTag(Tag tag) Called for each Tag visited.
Constructor Summary
NodeList()
NodeList(Node node) Create a one element node list.
NodeList extractAllNodesThatMatch(NodeFilter filter) Filter the list with the given filter non-recursively.
NodeList extractAllNodesThatMatch(NodeFilter filter, boolean recursive) Filter the list with the given filter.
Node elementAt(int i)

1. html代码里面所有的链接地址和链接名称

package parser;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

/**
 * Uses htmlparser to obtain every link address and link name contained in a
 * fragment of HTML code.
 *
 * @author chenguoyong
 */
public class Testhtmlparser {

    public static void main(String[] args) {
        String htmlcode = "<HTML><HEAD><TITLE>AAA</TITLE></HEAD><BODY>"
                + "<a href='http://topic.csdn.net/u/20080522/14/0ff402ef-c382-499a-8213-ba6b2f550425.html'>连接1</a>"
                + "<a href='http://topic.csdn.net'>连接2</a></BODY></HTML>";
        // Create a Parser for the given string and encoding.
        // (BUG FIX: the original imported org.htmlparser.Parser twice.)
        Parser parser = Parser.createParser(htmlcode, "GBK");
        // HtmlPage is a visitor that collects the page structure: HtmlPage(Parser parser)
        HtmlPage page = new HtmlPage(parser);
        try {
            // HtmlPage extends visitor; apply the given visitor to the current page.
            parser.visitAllNodesWith(page);
        } catch (ParserException e1) {
            // BUG FIX: the original silently swallowed the exception (e1 = null);
            // report it so parse failures are visible.
            e1.printStackTrace();
        }
        // All body nodes.
        NodeList nodelist = page.getBody();
        // Build a node filter that keeps only <A> tags.
        NodeFilter filter = new TagNameFilter("A");
        // Recursively extract the matching nodes.
        nodelist = nodelist.extractAllNodesThatMatch(filter, true);
        for (int i = 0; i < nodelist.size(); i++) {
            LinkTag link = (LinkTag) nodelist.elementAt(i);
            // Link address.
            System.out.println(link.getAttribute("href") + "\n");
            // Link name.
            System.out.println(link.getStringText());
        }
    }
}

结果如下:
http://topic.csdn.net/u/20080522/14/0ff402ef-c382-499a-8213-ba6b2f550425.html
连接1
http://topic.csdn.net
连接2

2.
使用HtmlParser抓去网页内容

package parser;

import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.parserapplications.StringExtractor;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Grabs page content with HtmlParser. The most convenient way to get the text
 * of a page is StringBean; the switches that control its output are explained
 * in the code below. The bundled sample StringExtractor has a method that
 * returns the content directly and internally also uses StringBean. Parsing
 * each tag directly with Parser works as well.
 *
 * @author chenguoyong
 */
public class GetContent {

    /**
     * Extracts the page text with a StringBean.
     *
     * @param url the page to fetch
     */
    public void getContentUsingStringBean(String url) {
        StringBean sb = new StringBean();
        sb.setLinks(true); // whether links on the page are shown in the output
        // Setting the two options below to true yields tidy output; set them to
        // false to keep the page's original layout, e.g. indentation on code pages.
        sb.setCollapse(true); // collapse each run of whitespace into one character
        sb.setReplaceNonBreakingSpaces(true); // replace &nbsp; with a regular space
        // BUG FIX: the original ignored the url parameter and hard-coded the
        // very address the caller passes in; use the parameter instead.
        sb.setURL(url);
        System.out.println("The Content is :\n" + sb.getStrings());
    }

    /**
     * Same mechanism as above — StringExtractor is a thin wrapper around StringBean.
     */
    public void getContentUsingStringExtractor(String url, boolean link) {
        StringExtractor se = new StringExtractor(url);
        String text = null;
        try {
            text = se.extractStrings(link);
            System.out.println("The content is :\n" + text);
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses the body tag directly and prints its plain text.
     * Keeps the original content formatting, including js code.
     */
    public void getContentUsingParser(String url) {
        NodeList nl;
        try {
            Parser p = new Parser(url);
            nl = p.parse(new NodeClassFilter(BodyTag.class));
            BodyTag bt = (BodyTag) nl.elementAt(0);
            System.out.println(bt.toPlainTextString());
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        String url = "http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html";
        // new GetContent().getContentUsingParser(url);
        // --------------------------------------------------
        new GetContent().getContentUsingStringBean(url);
    }
}

3.将整个html内容保存到指定文件

package parser;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * Downloads a web page and saves the complete HTML content to a fixed file.
 * The URL is hard-coded and must be edited by hand.
 *
 * @author chenguoyong
 */
public class ScrubSelectedWeb {
    private final static String CRLF = System.getProperty("line.separator");

    public static void main(String[] args) {
        BufferedReader in = null;
        BufferedWriter out = null;
        try {
            URL ur = new URL("http://www.google.cn/");
            InputStream instr = ur.openStream();
            in = new BufferedReader(new InputStreamReader(instr));
            out = new BufferedWriter(new FileWriter("D:/outPut.txt"));
            StringBuffer sb = new StringBuffer();
            String s;
            while ((s = in.readLine()) != null) {
                sb.append(s).append(CRLF);
            }
            System.out.println(sb);
            out.write(sb.toString());
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // BUG FIX: the original only closed the streams on success, leaking
            // the socket and the file handle whenever an exception was thrown.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // best-effort close
                }
            }
            if (out != null) {
                try {
                    out.close();
                } catch (IOException ignored) {
                    // best-effort close
                }
            }
        }
    }
}

4利用htmlparser提取网页纯文本的例子

package parser;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;

/**
 * Example of extracting the plain text of a web page with htmlparser.
 */
public class TestHTMLParser2 {

    /**
     * Reads the target html content and extracts its plain text.
     */
    public static void testHtml() {
        try {
            java.net.URL l_url = new java.net.URL(
                    "http://10.249.187.199:8083/injs100/");
            java.net.HttpURLConnection l_connection =
                    (java.net.HttpURLConnection) l_url.openConnection();
            l_connection.connect();
            java.io.InputStream l_urlStream = l_connection.getInputStream();
            java.io.BufferedReader l_reader = new java.io.BufferedReader(
                    new java.io.InputStreamReader(l_urlStream));
            try {
                // PERF FIX: the original built the page with String += in the
                // loop (quadratic); a StringBuffer grows linearly.
                StringBuffer sTotalString = new StringBuffer();
                String sCurrentLine;
                while ((sCurrentLine = l_reader.readLine()) != null) {
                    sTotalString.append(sCurrentLine).append("\r\n");
                }
                String testText = extractText(sTotalString.toString());
                // NOTE(review): testText was unused in the original as well;
                // print or return it if the extracted text is actually needed.
            } finally {
                // BUG FIX: the original never closed the reader.
                l_reader.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts the plain-text information from an html string.
     *
     * @param inputHtml html text
     * @return concatenated plain text of every node
     * @throws Exception on parse or encoding errors
     */
    public static String extractText(String inputHtml) throws Exception {
        StringBuffer text = new StringBuffer();
        // NOTE(review): the getBytes()/new String(..., "GBK") round trip encodes
        // with the platform default charset first — it only behaves as intended
        // when that default is GBK. Kept for compatibility; confirm before reuse.
        Parser parser = Parser.createParser(new String(inputHtml.getBytes(),
                "GBK"), "GBK");
        // Visit every node: the filter accepts everything.
        NodeList nodes = parser.extractAllNodesThatMatch(new NodeFilter() {
            public boolean accept(Node node) {
                return true;
            }
        });
        System.out.println(nodes.size());
        for (int i = 0; i < nodes.size(); i++) {
            Node nodet = nodes.elementAt(i);
            // Plain-text representation of the node.
            text.append(new String(nodet.toPlainTextString().getBytes("GBK"))
                    + "\r\n");
        }
        return text.toString();
    }

    /**
     * Parses content given as a file path or a Url.
     *
     * @param resource file / Url
     * @throws Exception on parse errors
     */
    public static void test5(String resource) throws Exception {
        Parser myParser = new Parser(resource);
        myParser.setEncoding("GBK");
        String filterStr = "table";
        NodeFilter filter = new TagNameFilter(filterStr);
        NodeList nodeList = myParser.extractAllNodesThatMatch(filter);
        /*for(int i=0;i<nodeList.size();i++)
        {
            TableTag tabletag = (TableTag) nodeList.elementAt(i);
            // tag name
            System.out.println(tabletag.getTagName());
            System.out.println(tabletag.getText());
        }*/
        // NOTE(review): demo code only — throws ArrayIndexOutOfBoundsException
        // when the page has fewer than two tables, and the result is unused.
        TableTag tabletag = (TableTag) nodeList.elementAt(1);
    }

    public static void main(String[] args) throws Exception {
        test5("http://10.249.187.199:8083/injs100/");
        // testHtml();
    }
}

5.html解析table

package parser;

import org.apache.log4j.Logger;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import junit.framework.TestCase;

public class ParserTestCase extends TestCase {

    private static final Logger logger = Logger.getLogger(ParserTestCase.class);

    public ParserTestCase(String name) {
        super(name);
    }

    /**
     * Tests the parsing of <table><tr><td></td></tr></table> markup.
     */
    public void testTable() {
        Parser myParser;
        NodeList nodeList = null;
        myParser = Parser.createParser("<body> "
                + "<table id=’table1′ >"
                + "<tr id='tro1'><td>1-11</td><td>1-12</td><td>1-13</td></tr>"
                + "<tr id='tro2'><td>1-21</td><td>1-22</td><td>1-23</td></tr>"
                + "<tr id='tro3'><td>1-31</td><td>1-32</td><td>1-33</td></tr></table>"
                + "<table id=’table2′ >"
                + "<tr id='tro4'><td>2-11</td><td>2-12</td><td>2-13</td></tr>"
                + "<tr id='tro5'><td>2-21</td><td>2-22</td><td>2-23</td></tr>"
                + "<tr id='tro6'><td>2-31</td><td>2-32</td><td>2-33</td></tr></table>"
                + "</body>", "GBK");
        NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] { tableFilter });
        try {
            nodeList = myParser.parse(lastFilter);
            // BUG FIX: the original looped with i <= nodeList.size(), reading
            // one element past the end of the list.
            for (int i = 0; i < nodeList.size(); i++) {
                if (nodeList.elementAt(i) instanceof TableTag) {
                    TableTag tag = (TableTag) nodeList.elementAt(i);
                    TableRow[] rows = tag.getRows();
                    for (int j = 0; j < rows.length; j++) {
                        TableRow tr = rows[j];
                        System.out.println(tr.getAttribute("id"));
                        // BUG FIX: compare with the constant first so a row
                        // without an id attribute cannot cause an NPE.
                        if ("tro1".equalsIgnoreCase(tr.getAttribute("id"))) {
                            TableColumn[] td = tr.getColumns();
                            for (int k = 0; k < td.length; k++) {
                                // logger.fatal("<td>" + td[k].toPlainTextString());
                                System.out.println("<td>" + td[k].toPlainTextString());
                            }
                        }
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches the target data rows (id "tr02") from the tables of the page.
     *
     * @param url target url
     * @throws Exception if the url cannot be opened
     */
    public static void getDatabyUrl(String url) throws Exception {
        Parser myParser = new Parser(url);
        NodeList nodeList = null;
        myParser.setEncoding("gb2312");
        NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] { tableFilter });
        try {
            nodeList = myParser.parse(lastFilter);
            // The data tables start around index 19-21; 15 leaves some margin.
            // BUG FIX: was i <= nodeList.size(), one past the end of the list.
            for (int i = 15; i < nodeList.size(); i++) {
                if (nodeList.elementAt(i) instanceof TableTag) {
                    TableTag tag = (TableTag) nodeList.elementAt(i);
                    TableRow[] rows = tag.getRows();
                    for (int j = 0; j < rows.length; j++) {
                        TableRow tr = rows[j];
                        if (tr.getAttribute("id") != null
                                && tr.getAttribute("id").equalsIgnoreCase("tr02")) {
                            TableColumn[] td = tr.getColumns();
                            // A single cell means "no matching record".
                            if (td.length == 1) {
                                System.out.println("对不起,没有你要查询的记录");
                            } else {
                                for (int k = 0; k < td.length; k++) {
                                    System.out.println("<td>内容:"
                                            + td[k].toPlainTextString().trim());
                                }
                            }
                        }
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Observed: 22 tables when the page has data, 19 when it has none.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // getDatabyUrl("http://gd.12530.com/user/querytonebytype.do?field=tonecode&condition=619505000000008942&type=1006&pkValue=619505000000008942");
            getDatabyUrl("http://gd.12530.com/user/querytonebytype.do?field=tonecode&condition=619272000000001712&type=1006&pkValue=619272000000001712");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

6.html解析常用

package com.jscud.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.TextExtractingVisitor;

import com.jscud.util.LogMan; // a logging helper class

/**
 * Demonstrates uses of Html Parse.
 *
 * @author scud http://www.jscud.com (http://www.jscud.com/)
 */
public class ParseHtmlTest {

    public static void main(String[] args) throws Exception {
        String aFile = "e:/jscud/temp/test.htm";
        String content = readTextFile(aFile, "GBK");
        test1(content);
        System.out.println("====================================");
        test2(content);
        System.out.println("====================================");
        test3(content);
        System.out.println("====================================");
        test4(content);
        System.out.println("====================================");
        test5(aFile);
        System.out.println("====================================");
        // Accessing a remote resource is comparatively slow.
        test5("http://www.jscud.com (http://www.jscud.com/)");
        System.out.println("====================================");
    }

    /**
     * Parses content read from a file; filePath may also be a Url.
     *
     * @param resource file / Url
     */
    public static void test5(String resource) throws Exception {
        Parser myParser = new Parser(resource);
        // Set the encoding.
        myParser.setEncoding("GBK");
        HtmlPage visitor = new HtmlPage(myParser);
        myParser.visitAllNodesWith(visitor);
        String textInPage = visitor.getTitle();
        System.out.println(textInPage);
    }

    /**
     * Page-oriented processing; recommended for a standard Html page.
     */
    public static void test4(String content) throws Exception {
        Parser myParser;
        myParser = Parser.createParser(content, "GBK");
        HtmlPage visitor = new HtmlPage(myParser);
        myParser.visitAllNodesWith(visitor);
        String textInPage = visitor.getTitle();
        System.out.println(textInPage);
    }

    /**
     * Parses the html page with the Visitor pattern.
     *
     * Small plus: entities such as &lt;&gt; are translated.
     * Minus: lots of whitespace, and links cannot be extracted.
     */
    public static void test3(String content) throws Exception {
        Parser myParser;
        myParser = Parser.createParser(content, "GBK");
        TextExtractingVisitor visitor = new TextExtractingVisitor();
        myParser.visitAllNodesWith(visitor);
        String textInPage = visitor.getExtractedText();
        System.out.println(textInPage);
    }

    /**
     * Prints plain text and link targets, using filter conditions.
     */
    public static void test2(String content) throws ParserException {
        Parser myParser;
        NodeList nodeList = null;
        myParser = Parser.createParser(content, "GBK");
        NodeFilter textFilter = new NodeClassFilter(TextNode.class);
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        // meta tags are not handled for now
        // NodeFilter metaFilter = new NodeClassFilter(MetaTag.class);
        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter });
        nodeList = myParser.parse(lastFilter);
        Node[] nodes = nodeList.toNodeArray();
        for (int i = 0; i < nodes.length; i++) {
            Node anode = (Node) nodes[i];
            String line = "";
            if (anode instanceof TextNode) {
                TextNode textnode = (TextNode) anode;
                // line = textnode.toPlainTextString().trim();
                line = textnode.getText();
            } else if (anode instanceof LinkTag) {
                LinkTag linknode = (LinkTag) anode;
                line = linknode.getLink();
                // @todo ("") filter jsp tags: you can implement this yourself
                // line = StringFunc.replace(line, "<%.*%>", "");
            }
            if (isTrimEmpty(line))
                continue;
            System.out.println(line);
        }
    }

    /**
     * Parses plain text nodes only.
     *
     * @param content the html text
     * @throws ParserException on parse errors
     */
    public static void test1(String content) throws ParserException {
        Parser myParser;
        Node[] nodes = null;
        myParser = Parser.createParser(content, null);
        nodes = myParser.extractAllNodesThatAre(TextNode.class); // exception could be thrown here
        for (int i = 0; i < nodes.length; i++) {
            TextNode textnode = (TextNode) nodes[i];
            String line = textnode.toPlainTextString().trim();
            if (line.equals(""))
                continue;
            System.out.println(line);
        }
    }

    /**
     * Reads a whole file into a string.
     *
     * @param sFileName file name
     * @param sEncode String
     * @return file content
     */
    public static String readTextFile(String sFileName, String sEncode) {
        StringBuffer sbStr = new StringBuffer();
        BufferedReader ins = null;
        try {
            File ff = new File(sFileName);
            InputStreamReader read = new InputStreamReader(
                    new FileInputStream(ff), sEncode);
            ins = new BufferedReader(read);
            String dataLine;
            while (null != (dataLine = ins.readLine())) {
                sbStr.append(dataLine);
                sbStr.append("\r\n");
            }
        } catch (Exception e) {
            LogMan.error("read Text File Error", e);
        } finally {
            // BUG FIX: the original only closed the reader on success.
            if (ins != null) {
                try {
                    ins.close();
                } catch (Exception ignored) {
                    // best-effort close
                }
            }
        }
        return sbStr.toString();
    }

    /**
     * Returns true when the string is empty after trimming both ends.
     *
     * @param astr String
     * @return boolean
     */
    public static boolean isTrimEmpty(String astr) {
        if ((null == astr) || (astr.length() == 0)) {
            return true;
        }
        if (isBlank(astr.trim())) {
            return true;
        }
        return false;
    }

    /**
     * Returns true when the string is null or has length 0.
     *
     * @param astr source string.
     * @return boolean
     */
    public static boolean isBlank(String astr) {
        if ((null == astr) || (astr.length() == 0)) {
            return true;
        } else {
            return false;
        }
    }
}