网页源代码抓取工具(自动判断网页编码格式,解决乱码问题)(图)
优采云 发布时间: 2021-10-19 04:07
网页源代码抓取工具(自动判断网页编码格式,解决乱码问题)(图)
拿到代码后,首先需要从网上下载 HTMLParser.NET(即代码中引用的 Winista.Text.HtmlParser)。它的最高版本是2003;注意你的VS版本一定不能是2010,2005和2008都可以。下载后找到其中的动态链接库(DLL),然后在项目中添加对它的引用就可以了。
源代码:
<p>using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Net;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Visitors;
namespace 抓取网页源代码
{
public partial class Form1 : Form
{
// Constructs the form. The only work done here is the call to InitializeComponent(),
// which is defined in another part of this partial class (presumably the
// designer-generated part — not visible in this file).
public Form1()
{
InitializeComponent();
//textBox1.Text = "http://imgsrc.baidu.com/baike/pic/item/d8b8c92ab6004017d42af1b1.jpg";// disabled sample URL: fetch a file and save it to a specified location
}
/// <summary>
/// Click handler for button1: passes the text of textBox1 to downhtml_1 and
/// shows the returned string in textBox2.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Disabled alternative kept for reference: fetch a file and save it to a fixed path.
    //     WebClient web = new WebClient();
    //     web.DownloadFile(textBox1.Text, "c://1.jpg");
    // (that variant was wrapped in a try/catch showing ex.Message in a MessageBox)

    string pageUrl = textBox1.Text;
    string pageSource = downhtml_1(pageUrl);
    textBox2.Text = pageSource;
}
/// <summary>
/// Fetches page source, method one: downloads the resource at
/// <paramref name="WebUrl"/> as a string using WebClient.
/// </summary>
/// <param name="WebUrl">Address of the page to download.</param>
/// <returns>
/// The downloaded text, or "失败!" when the download threw or the text is
/// empty/whitespace only.
/// </returns>
/// <remarks>
/// Exceptions raised while downloading are shown in a message box and are not rethrown.
/// </remarks>
private string downhtml_1(string WebUrl)
{
    string htmlText = "";
    try
    {
        // WebClient is disposable; the using block releases it even when
        // DownloadString throws (the original never disposed it).
        using (WebClient myWebClient = new WebClient())
        {
            // Encoding that DownloadString uses to turn the response bytes into text.
            // NOTE(review): Encoding.Default is the system ANSI code page, so a page
            // served in another charset may come back garbled — confirm this is intended.
            myWebClient.Encoding = System.Text.Encoding.Default;
            htmlText = myWebClient.DownloadString(WebUrl);
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
    }

    // An empty result (including the exception path above) is reported as a failure marker.
    if (htmlText.Trim() == "")
    {
        htmlText = "失败!";
    }
    return htmlText;
}
//private WebClient WebClient()
//{
// throw new NotImplementedException();
//}
/// <summary>
/// Fetches page source, method two: issues an HTTP request with HttpWebRequest,
/// reads the whole response body, and passes it through filterScript.
/// </summary>
/// <param name="WebUrl">Absolute URL of the page to download.</param>
/// <returns>
/// The result of filterScript applied to the downloaded text, or "失败!" when
/// nothing could be downloaded (exception or empty/whitespace-only body).
/// </returns>
/// <remarks>
/// Exceptions are shown in a message box and are not rethrown.
/// </remarks>
private string downhtml_2(string WebUrl)
{
    string responseData = "";
    string string1 = "";
    try
    {
        // WebRequest.Create returns the protocol-specific subclass; the cast
        // to HttpWebRequest gives access to the HTTP-only members used below.
        HttpWebRequest Req = (HttpWebRequest)WebRequest.Create(new System.Uri(WebUrl));
        // Value sent in the User-Agent request header.
        Req.UserAgent = "Mozilla/4.0(compatible;MSIE 6.0;Windows NT 5.0; .NET CLR 1.1.4322)";
        Req.Timeout = 30000; // milliseconds

        // HttpWebResponse instances are obtained from GetResponse(), never constructed directly.
        // The body must be read BEFORE the response is closed: the original called
        // response.Close() ahead of ReadToEnd(), closing the stream it was about to read.
        // The using blocks also release the connection when reading throws.
        // NOTE(review): Encoding.Default is the system ANSI code page; pages in another
        // charset may decode incorrectly — confirm this is intended.
        using (HttpWebResponse response = (HttpWebResponse)Req.GetResponse())
        using (Stream receiveStream = response.GetResponseStream())
        using (StreamReader responseReader = new StreamReader(receiveStream, Encoding.Default))
        {
            responseData = responseReader.ReadToEnd();
        }

        // Disabled alternative kept for reference: extract plain text with HtmlParser.
        //     Lexer lexer = new Lexer(filterScript(responseData));
        //     Parser parser = new Parser(lexer);              // build a parser from a URL or page string
        //     TextExtractingVisitor textvisitor = new TextExtractingVisitor();
        //     parser.VisitAllNodesWith(textvisitor);          // walk every node with the visitor
        //     string1 = textvisitor.ExtractedText.ToString();

        // filterScript is defined later in this class; per its original comment it
        // removes JavaScript from the markup.
        string1 = filterScript(responseData);
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
    }

    // The original stored the failure marker in responseData but returned string1,
    // so the marker never reached the caller; return it directly, as downhtml_1 does.
    if (responseData.Trim() == "")
    {
        return "失败!";
    }
    return string1;
}
public string filterScript(string str1)//去除代码中的javascript
{
// string str1 = "new TextParser('/posts/05/B1/B3/9E/content_html.txt', 'content_tree');";
if (str1.Contains("