C# 抓取网页数据(模拟登录获得 cookie、代理的设置、利用方法模拟(图))
优采云 发布时间: 2021-11-16 13:06
C# 抓取网页数据(模拟登录获得 cookie、代理的设置、利用方法模拟(图))
/// <summary>
/// Sends an HTTP POST request and returns the response body as text.
/// </summary>
/// <param name="url">The URL to request.</param>
/// <param name="postdata">
/// Form-encoded request body, e.g. <c>id=1&amp;name=test</c>.
/// A null value is sent as an empty body.
/// </param>
/// <param name="proxy">Proxy assigned to the request as-is.</param>
/// <param name="cookie">Cookie container attached to the request.</param>
/// <param name="timeout">Request timeout in milliseconds.</param>
/// <returns>The response body decoded as UTF-8.</returns>
public static string Crawl(string url, string postdata, WebProxy proxy, CookieContainer cookie, int timeout = 10000)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    try
    {
        request.Proxy = proxy;
        request.Timeout = timeout;
        request.AllowAutoRedirect = true;
        request.CookieContainer = cookie;
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";

        // UTF-8 instead of ASCII: ASCII encoding replaces every non-ASCII
        // character with '?', silently corrupting the posted data.
        byte[] body = Encoding.UTF8.GetBytes(postdata ?? string.Empty);
        request.ContentLength = body.Length;

        using (Stream requestStream = request.GetRequestStream())
        {
            requestStream.Write(body, 0, body.Length);
        }

        using (WebResponse response = request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
    finally
    {
        // No catch block: exceptions propagate with their original stack trace
        // (the previous "throw ex;" reset it). HttpWebRequest is not
        // IDisposable, so Abort() is kept as the cleanup step on every path.
        request.Abort();
    }
}
模拟登录获取cookie内容
首先找到登录页面,分析登录页面的post参数和链接,获取cookie后直接传给上面的方法
/// <summary>
/// Requests a page and returns the cookies collected while doing so.
/// </summary>
/// <param name="url">The URL to request (e.g. a login page).</param>
/// <param name="proxy">Proxy assigned to the request as-is.</param>
/// <param name="timeout">Request timeout in milliseconds.</param>
/// <returns>The cookie container that was attached to the request.</returns>
public static CookieContainer GetCookie(string url, WebProxy proxy, int timeout = 10000)
{
    CookieContainer cookies = new CookieContainer();
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    try
    {
        request.Proxy = proxy;
        request.Timeout = timeout;
        request.AllowAutoRedirect = true;
        request.CookieContainer = cookies;

        // Only the side effect matters here: the request fills the attached
        // container. The old "response.Cookies = ..." assignment was dropped
        // because it only modified the response object, which is discarded.
        using (WebResponse response = request.GetResponse())
        {
        }

        return cookies;
    }
    finally
    {
        // No catch block: exceptions propagate with their original stack trace
        // (the previous "throw ex;" reset it).
        request.Abort();
    }
}
模拟登录获取cookie字符串
/// <summary>
/// Requests a page and returns the collected cookies as a single
/// HTTP cookie header string, suitable for passing to SetCookie.
/// </summary>
/// <param name="url">The URL to request (e.g. a login page).</param>
/// <param name="proxy">Proxy assigned to the request as-is.</param>
/// <param name="timeout">Request timeout in milliseconds.</param>
/// <returns>The cookie header string for the requested URI.</returns>
public static string GetCookieString(string url, WebProxy proxy, int timeout = 10000)
{
    CookieContainer cookies = new CookieContainer();
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    try
    {
        request.Proxy = proxy;
        request.Timeout = timeout;
        request.AllowAutoRedirect = true;
        request.CookieContainer = cookies;

        // The response itself is not needed; issuing the request fills the
        // attached cookie container. The old "response.Cookies = ..."
        // assignment was dropped because it did not affect the result.
        using (WebResponse response = request.GetResponse())
        {
        }

        return cookies.GetCookieHeader(request.RequestUri);
    }
    finally
    {
        // No catch block: exceptions propagate with their original stack trace
        // (the previous "throw ex;" reset it).
        request.Abort();
    }
}
代理设置
/// <summary>
/// Creates a proxy with the given address and credentials.
/// </summary>
/// <param name="port">
/// The proxy address, e.g. <c>http://host:8080</c>. Despite the parameter
/// name this is the full address, not just a port number. When no scheme is
/// present (<c>host</c> or <c>host:8080</c>), <c>http://</c> is prepended.
/// </param>
/// <param name="user">User name for proxy authentication.</param>
/// <param name="password">Password for proxy authentication.</param>
/// <returns>The configured proxy.</returns>
/// <exception cref="ArgumentNullException"><paramref name="port"/> is null.</exception>
public static WebProxy CreatePorxy(string port, string user, string password)
{
    if (port == null)
    {
        throw new ArgumentNullException("port");
    }

    // Without a scheme, "xx.com" is rejected by Uri and "xx.com:8080" is
    // parsed with "xx.com" as the scheme, so default to http explicitly.
    string address = port.Trim();
    if (address.IndexOf("://", StringComparison.Ordinal) < 0)
    {
        address = "http://" + address;
    }

    WebProxy proxy = new WebProxy();
    proxy.Address = new Uri(address);
    proxy.Credentials = new NetworkCredential(user, password);
    return proxy;
}
使用 WebBrowser 获取 js 生成的页面
注意:由于无法确定页面中的 js 何时执行完毕,这里在页面加载完成后固定再等待 5s,并默认此时 js 已执行完成,效率有待提高。
另外,WebBrowser 控件需要运行在 STA 线程中,因此程序入口方法需要添加 [STAThread]。
/// <summary>
/// Loads a page in a WinForms WebBrowser control, waits for scripts to run,
/// and returns the resulting HTML.
/// </summary>
/// <param name="url">The URL to load.</param>
/// <param name="scriptWaitMilliseconds">
/// Extra time to keep pumping messages after the document reports
/// Complete, giving page scripts a chance to finish. Defaults to 5 seconds.
/// </param>
/// <returns>
/// The inner HTML of the document's active element (falling back to the
/// body), or an empty string when no document or element is available.
/// </returns>
/// <remarks>
/// This method blocks the calling thread and pumps its message loop.
/// NOTE(review): the accompanying article says the caller needs [STAThread].
/// </remarks>
public static string CrawlDynamic(string url, int scriptWaitMilliseconds = 5000)
{
    // The browser control is disposed on every path; previously it leaked.
    using (WebBrowser browser = new WebBrowser())
    {
        browser.ScriptErrorsSuppressed = true;
        browser.Navigate(url);

        // Wait for the document to finish loading. The short sleep keeps the
        // message pump responsive without spinning a CPU core at 100%.
        while (browser.ReadyState != WebBrowserReadyState.Complete)
        {
            Application.DoEvents();
            System.Threading.Thread.Sleep(10);
        }

        // Keep pumping messages for the settle period so scripts can run.
        // A stopwatch replaces the former timer, which was never disposed and
        // set a flag from another thread without synchronisation.
        System.Diagnostics.Stopwatch settle = System.Diagnostics.Stopwatch.StartNew();
        while (settle.ElapsedMilliseconds < scriptWaitMilliseconds)
        {
            Application.DoEvents();
            System.Threading.Thread.Sleep(10);
        }

        HtmlDocument document = browser.Document;
        if (document == null)
        {
            return string.Empty;
        }

        // ActiveElement can be null; fall back to the body in that case.
        HtmlElement root = document.ActiveElement ?? document.Body;
        return root == null ? string.Empty : root.InnerHtml;
    }
}
为 WebBrowser 设置 cookie 以模拟登录
一开始没有成功,以为这个方法不能用,后来发现是 domain 设置的问题。下面例子中的设置是可用的,实际使用时可能需要根据自己的情况选择域名。
/// <summary>
/// P/Invoke declaration for the <c>InternetSetCookie</c> function exported by wininet.dll.
/// </summary>
/// <param name="lpszUrlName">URL the cookie is set for (SetCookie passes its domain argument here).</param>
/// <param name="lbszCookieName">Name of the cookie.</param>
/// <param name="lpszCookieData">Value of the cookie.</param>
/// <returns>
/// NOTE(review): presumably true on success - confirm against the WinINet documentation.
/// Because SetLastError is true, the last Win32 error is recorded for the caller.
/// </returns>
[DllImport("wininet.dll", CharSet = CharSet.Auto, SetLastError = true)]
public static extern bool InternetSetCookie(string lpszUrlName, string lbszCookieName, string lpszCookieData);
/// <summary>
/// Sets each cookie from a cookie header string through InternetSetCookie,
/// so that a WebBrowser control can use them.
/// </summary>
/// <param name="cookieStr">
/// Cookie string in <c>name=value; name2=value2</c> form, e.g. the result of
/// GetCookieString. A null or empty string sets nothing.
/// </param>
/// <param name="domain">The URL/domain the cookies are set for.</param>
public static void SetCookie(string cookieStr, string domain)
{
    if (string.IsNullOrEmpty(cookieStr))
    {
        return;
    }

    foreach (string pair in cookieStr.Split(';'))
    {
        // Split on the FIRST '=' only: cookie values (e.g. base64 tokens)
        // may themselves contain '=', and such cookies were previously dropped.
        int separator = pair.IndexOf('=');
        if (separator <= 0)
        {
            continue;
        }

        // Cookie headers separate pairs with "; ", so trim the leading space
        // that would otherwise become part of the cookie name.
        string name = pair.Substring(0, separator).Trim();
        string value = pair.Substring(separator + 1).Trim();
        if (name.Length == 0)
        {
            continue;
        }

        // Best effort, as before: the result of the native call is not checked.
        InternetSetCookie(domain, name, value);
    }
}
使用演示
// Proxy; pass null when no proxy is needed.
// The address includes the scheme so that it parses as an absolute URI.
WebProxy proxy = WebCrawl.WebRequestHelper.CreatePorxy("http://xx.com", "user", "password");
// Obtain the cookies from the login page.
CookieContainer cookie = WebCrawl.WebRequestHelper.GetCookie("http://xxxx.login.com", proxy);
// Fetch the page. Crawl requires the post data as its second argument;
// replace the sample string with the parameters the target page expects.
string html = WebCrawl.WebRequestHelper.Crawl("http://xxx.index.com", "id=1&name=test", proxy, cookie);
// Obtain the cookies from the login page as a header string.
string cookiestr = WebCrawl.WebRequestHelper.GetCookieString("http://xxxx.login.com", proxy);
// Hand the cookies to the WebBrowser control.
WebCrawl.WebRequestHelper.SetCookie(cookiestr, "https://xx.com");
// Fetch a page that requires login and is generated by JavaScript (ordinary pages work too).
string htmlWithJs = WebCrawl.WebRequestHelper.CrawlDynamic("http://xxx.index.com");