讀取頁面數據的例子

vchengyun 2007-04-03

展開全文

讀取頁面數據的例子

作者：失衡的天秤　出處：天極Chinabyte

[ 2005-11-16 13:49 ]

前幾天剛完成對網頁數據抓取的工作，有些想法。下面的例子作為一個參照（不是工作的源CODE），供賞。

package com.zjf.websplider;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.Socket;
import java.net.URL;
import java.util.Properties;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;

/**
* Web頁面抓取,負責從頁面上抓取相應的信息
*
* @author Administrator
*/

public class WebSplider
{
/**
* 根據(jù)請求的Url路徑,和輸出流進行模擬Http請求,得到頁面數(shù)據(jù)
*
* @throws Exception
*/
public static void getURL(String url, OutputStream out) throws Exception
{
// 創(chuàng)建URL對象
URL http_URL = new URL(url);
// 請求協(xié)議
String protocol = http_URL.getProtocol(）;
// 服務(wù)器主機名稱
String host = http_URL.getHost(）;
// 請求的文件名稱
String filename = http_URL.getFile(）;
// 端口
int port = http_URL.getPort(）;

// 如果不是http請求
if (!protocol.equals("http"））
{
// 拋出異常
throw new IllegalArgumentException("僅僅支持http請求協(xié)議"）;
}
// 請求的服務(wù)器主機名稱為null
if (host == null) { throw new IllegalArgumentException("無效的服務(wù)主機帳戶"); }
// 如果服務(wù)器端口等于-1
if (port == -1)
{
// 則使用默認的端口80
port = 80;
}
// 如果請求的文件不存在
if (filename == "")
{
// 則,路徑默認到服務(wù)器的當前目錄下
filename = "/";
}
// 建立底層的Socket通訊
Socket socket = new Socket(host, port）;
// 得到輸入流數(shù)據(jù)
InputStream from = socket.getInputStream(）;
// 構(gòu)造http請求頭
PrintWriter to = new PrintWriter(socket.getOutputStream()）;
// 設(shè)置http請求頭類型及信息
to.print("GET " + filename + " HTTP/1.0\n"）;
to.print("Accept: */*\n");
to.print("Accept-Language: zh-cn\n");
to.print("Accept-Encoding: gb2312, deflate\n"）;
to
.print("User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)\n"）;
to.print("Host: " + http_URL.getHost() + "\n"）;
to.print("Connection: Keep-Alive\n\n"）;
to.flush(）;
byte[] buf = new byte[4096];
int bytes_read;
// 讀取數(shù)據(jù)流
while ((bytes_read = from.read(buf)) != -1)
{
out.write(buf, 0, bytes_read）;
}
// 關(guān)閉socket通訊,及數(shù)據(jù)流對象
socket.close(）;
out.close(）;
}

/**
* 得到請求的頁面信息
*/
public static String getRequestPage(String url)
{
ByteArrayOutputStream outStream = new ByteArrayOutputStream(）;
// 返回的html代碼
String html = "";
try
{
// 要讀取的頁面地址
getURL(url, outStream）;
html = outStream.toString().toLowerCase(）;
html = getList(html).toLowerCase(）;
}
catch (IOException e)
{
System.out.println("寫入文件錯誤" + e.getMessage()）;
}
catch (Exception e)
{
e.printStackTrace(）;
}
return html;
}

/**
* 截取HTML標簽中的內(nèi)容
*
* @param html
* @return
*/
public static String getList(String html)
{
int StartPos = -1;
int EndPos = -1;
StartPos = html.indexOf("", 1）;
//EndPos = html.indexOf("",1）;
return html.substring(StartPos）;
}

/**
* 將得到的html頁面轉(zhuǎn)換成DOM文檔
*/
public static Document getDocument(String url)
{
String html = getRequestPage(url）;
ByteArrayInputStream reader = new ByteArrayInputStream(html.getBytes()）;
Tidy tidy = new Tidy(）;
// 將html文檔轉(zhuǎn)換成符合規(guī)范的Dom文檔
tidy.setXHTML(true）;
Document doc = null;
try
{
doc = tidy.parseDOM(reader, System.out）;
}
catch (Exception e)
{
e.printStackTrace(）;
}
return doc;
}

/**
* 創(chuàng)建Dom轉(zhuǎn)換對象
*
* @return
*/
public static Transformer newTransformer()
{
try
{
Transformer transformer = TransformerFactory.newInstance()
.newTransformer();
Properties properties = transformer.getOutputProperties();
properties.setProperty(OutputKeys.ENCODING, "GBK");
properties.setProperty(OutputKeys.METHOD, "xml");
properties.setProperty(OutputKeys.VERSION, "1.0");
properties.setProperty(OutputKeys.INDENT, "no");
transformer.setOutputProperties(properties);
return transformer;
}
catch (TransformerConfigurationException tce)
{
throw new RuntimeException(tce.getMessage());
}
}
/**
* 取得dom文檔相應的字符串
* @param args
*/
public String getStringByDocument(String url)
{
Document doc = this.getDocument(url);
//取得所有的節(jié)點
NodeList list = doc.getElementsByTagName("html"）;
Transformer transformer = newTransformer(）;
StringBuffer buffer = new StringBuffer(）;
//對于每個節(jié)點
for (int i = 0; i < list.getLength(）; i++)
{
StringWriter sw = new StringWriter(）;
Node node = list.item(i）;
try
{
//將節(jié)點轉(zhuǎn)換成字符串
transformer.transform(new DOMSource(node), new StreamResult(sw)）;
buffer.append(new String(sw.toString().getBytes("ISO8859-1"))）;
System.out.println(i + " : " + new String(sw.toString().getBytes("ISO8859-1"))）;
}
catch (Exception e)
{
e.printStackTrace(）;
}
}
return buffer.toString(）;
}

public static void main(String args[])
{
WebSplider web = new WebSplider(）;
}

}