C# 抓取网页数据(模拟登录获得 cookie、代理的设置及使用方法(图))

优采云 发布时间: 2021-11-16 13:06

  C# 抓取网页数据(

模拟登录获得 cookie、代理的设置及使用方法(图)

)

   /// <summary>
   /// Sends an HTTP POST request and returns the response body as text.
   /// </summary>
   /// <param name="url">Absolute URL to post to.</param>
   /// <param name="postdata">
   /// Form-encoded request body, e.g. <c>id=1&amp;name=test</c>.
   /// <c>null</c> is sent as an empty body.
   /// </param>
   /// <param name="proxy">Proxy assigned to the request; passed through unchanged, including <c>null</c>.</param>
   /// <param name="cookie">Cookie container assigned to the request; cookies set by the response are stored in it.</param>
   /// <param name="timeout">Request timeout in milliseconds.</param>
   /// <returns>The response body decoded as UTF-8.</returns>
   /// <remarks>
   /// Failures (for example <see cref="WebException"/>) propagate to the caller with
   /// their original stack trace.
   /// </remarks>
   public static string Crawl(string url, string postdata, WebProxy proxy, CookieContainer cookie, int timeout = 10000)
   {
       HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
       request.Proxy = proxy;
       request.Timeout = timeout;
       request.AllowAutoRedirect = true;
       request.CookieContainer = cookie;
       request.Method = "POST";
       request.ContentType = "application/x-www-form-urlencoded";

       // UTF-8 yields the same bytes as ASCII for ASCII input, but does not
       // replace non-ASCII characters with '?'.
       byte[] payload = Encoding.UTF8.GetBytes(postdata ?? string.Empty);
       request.ContentLength = payload.Length;

       try
       {
           using (Stream requestStream = request.GetRequestStream())
           {
               requestStream.Write(payload, 0, payload.Length);
           }

           using (WebResponse response = request.GetResponse())
           using (Stream responseStream = response.GetResponseStream())
           using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
           {
               return reader.ReadToEnd();
           }
       }
       catch
       {
           // Cancel whatever is still in flight, then rethrow with "throw;" so
           // the original stack trace is preserved.
           request.Abort();
           throw;
       }
   }

  模拟登录获取cookie内容

  首先找到登录页面,分析登录页面的post参数和链接,获取cookie后直接传给上面的方法

   /// <summary>
   /// Requests a page and returns the cookies collected while doing so.
   /// </summary>
   /// <param name="url">Absolute URL to request.</param>
   /// <param name="proxy">Proxy assigned to the request; passed through unchanged, including <c>null</c>.</param>
   /// <param name="timeout">Request timeout in milliseconds.</param>
   /// <returns>The cookie container that was attached to the request.</returns>
   /// <remarks>
   /// No method or body is set on the request, so it is sent with the default
   /// method of <see cref="HttpWebRequest"/>; no credentials are submitted.
   /// Failures propagate to the caller with their original stack trace.
   /// </remarks>
   public static CookieContainer GetCookie(string url, WebProxy proxy, int timeout = 10000)
   {
       CookieContainer cookies = new CookieContainer();

       HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
       request.Proxy = proxy;
       request.Timeout = timeout;
       request.AllowAutoRedirect = true;
       request.CookieContainer = cookies;

       try
       {
           // Only the cookies are needed: the response is disposed unread.
           using (request.GetResponse())
           {
               return cookies;
           }
       }
       catch
       {
           // Cancel whatever is still in flight, then rethrow with "throw;" so
           // the original stack trace is preserved.
           request.Abort();
           throw;
       }
   }

  模拟登录获取cookie字符串

   /// <summary>
   /// Requests a page and returns the collected cookies as a single
   /// Cookie-header string, which can be handed to a WebBrowser.
   /// </summary>
   /// <param name="url">Absolute URL to request.</param>
   /// <param name="proxy">Proxy assigned to the request; passed through unchanged, including <c>null</c>.</param>
   /// <param name="timeout">Request timeout in milliseconds.</param>
   /// <returns>
   /// The result of <see cref="CookieContainer.GetCookieHeader(Uri)"/> for the
   /// requested URI.
   /// </returns>
   /// <remarks>
   /// Failures propagate to the caller with their original stack trace.
   /// </remarks>
   public static string GetCookieString(string url, WebProxy proxy, int timeout = 10000)
   {
       CookieContainer cookies = new CookieContainer();

       HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
       request.Proxy = proxy;
       request.Timeout = timeout;
       request.AllowAutoRedirect = true;
       request.CookieContainer = cookies;

       try
       {
           // Only the cookies are needed: the response is disposed unread.
           using (request.GetResponse())
           {
               return cookies.GetCookieHeader(request.RequestUri);
           }
       }
       catch
       {
           // Cancel whatever is still in flight, then rethrow with "throw;" so
           // the original stack trace is preserved.
           request.Abort();
           throw;
       }
   }

  代理设置

   /// <summary>
   /// Creates a proxy with the given address and credentials.
   /// </summary>
   /// <param name="port">
   /// Proxy address. Despite the parameter name this is the whole address, not
   /// just a port number: either a full URI such as <c>http://host:8080</c>, or
   /// a bare <c>host</c> / <c>host:port</c>, in which case <c>http://</c> is assumed.
   /// </param>
   /// <param name="user">User name for proxy authentication.</param>
   /// <param name="password">Password for proxy authentication.</param>
   /// <returns>The configured <see cref="WebProxy"/>.</returns>
   /// <exception cref="ArgumentNullException"><paramref name="port"/> is <c>null</c>.</exception>
   /// <exception cref="UriFormatException">The address cannot be parsed as a URI.</exception>
   public static WebProxy CreatePorxy(string port, string user, string password)
   {
       if (port == null)
       {
           throw new ArgumentNullException("port");
       }

       // Without an explicit scheme, "xx.com" is rejected by Uri and
       // "xx.com:8080" is parsed as scheme "xx.com", so default to http.
       string address = port.IndexOf("://", StringComparison.Ordinal) >= 0
           ? port
           : "http://" + port;

       WebProxy proxy = new WebProxy();
       proxy.Address = new Uri(address);
       proxy.Credentials = new NetworkCredential(user, password);
       return proxy;
   }

  使用webbrowser获取js生成的页面

  注意:由于无法得知页面中的 js 何时执行完毕,这里在页面加载完成后固定再等待 5 秒,并默认此时 js 已执行完成,效率有待提高。

  另外,WebBrowser 控件必须运行在 STA 线程上,因此调用方的入口方法需要添加 [STAThread]。

   /// <summary>
   /// Loads a page in a WinForms <see cref="WebBrowser"/> so that its scripts
   /// run, then returns the resulting HTML.
   /// </summary>
   /// <param name="url">URL to navigate to.</param>
   /// <param name="scriptWaitMilliseconds">
   /// Extra time to keep pumping messages after the browser reports the page as
   /// complete, to give scripts a chance to finish. Defaults to 5000 ms.
   /// </param>
   /// <returns>
   /// The inner HTML of the document's active element, or of its body when no
   /// element is active; an empty string when no document is available.
   /// </returns>
   /// <remarks>
   /// Blocks the calling thread while pumping the Windows message queue. There
   /// is no timeout on the initial load: the method does not return until the
   /// browser reports <see cref="WebBrowserReadyState.Complete"/>.
   /// </remarks>
   public static string CrawlDynamic(string url, int scriptWaitMilliseconds = 5000)
   {
       // Pause between message pumps so the wait loops do not spin a CPU core.
       const int pumpIntervalMilliseconds = 10;

       using (WebBrowser browser = new WebBrowser())
       {
           browser.ScriptErrorsSuppressed = true;
           browser.Navigate(url);

           // Wait for the initial load to finish.
           while (browser.ReadyState != WebBrowserReadyState.Complete)
           {
               Application.DoEvents();
               System.Threading.Thread.Sleep(pumpIntervalMilliseconds);
           }

           // Keep pumping for a fixed period so page scripts can run. A
           // stopwatch on this thread replaces the previous timer, which set a
           // flag from another thread and was never disposed.
           System.Diagnostics.Stopwatch settle = System.Diagnostics.Stopwatch.StartNew();
           while (settle.ElapsedMilliseconds < scriptWaitMilliseconds)
           {
               Application.DoEvents();
               System.Threading.Thread.Sleep(pumpIntervalMilliseconds);
           }

           HtmlDocument document = browser.Document;
           if (document == null)
           {
               return string.Empty;
           }

           // Fall back to the body when no element is active.
           HtmlElement element = document.ActiveElement ?? document.Body;
           return element == null ? string.Empty : element.InnerHtml;
       }
   }

  为网页浏览器设置 cookie 以模拟登录

  一开始没有成功,以为这个方法不能用,后来发现是 domain 设置的问题。我的例子中使用下面的设置是可用的;这里的 domain 可能需要根据自己的情况选择。

   /// <summary>
   /// P/Invoke declaration for the native <c>InternetSetCookie</c> function
   /// exported by wininet.dll.
   /// </summary>
   /// <param name="lpszUrlName">First native argument; SetCookie in this file passes its <c>domain</c> argument here.</param>
   /// <param name="lbszCookieName">Second native argument; SetCookie passes the cookie name here.</param>
   /// <param name="lpszCookieData">Third native argument; SetCookie passes the cookie value here.</param>
   /// <returns>The native BOOL result. NOTE(review): per the WinINet documentation this is true on success; confirm.</returns>
   // SetLastError = true asks the marshaller to capture the Win32 last-error
   // code of the call, so it can be read with Marshal.GetLastWin32Error().
   [DllImport("wininet.dll", CharSet = CharSet.Auto, SetLastError = true)]

public static extern bool InternetSetCookie(string lpszUrlName, string lbszCookieName, string lpszCookieData);

/// <summary>
/// Passes every cookie in a Cookie-header style string to
/// <c>InternetSetCookie</c>, so that a WebBrowser can use them.
/// </summary>
/// <param name="cookieStr">
/// Cookies in the form <c>name1=value1; name2=value2</c>, such as the string
/// returned by GetCookieString. <c>null</c> or empty does nothing.
/// </param>
/// <param name="domain">
/// Value passed as the first argument of <c>InternetSetCookie</c> for every
/// cookie (the usage sample passes a URL such as <c>https://xx.com</c>).
/// </param>
/// <remarks>
/// Best effort: segments without an '=' are skipped and the result of
/// <c>InternetSetCookie</c> is not checked.
/// </remarks>
public static void SetCookie(string cookieStr, string domain)
{
    if (string.IsNullOrEmpty(cookieStr))
    {
        return;
    }

    foreach (string segment in cookieStr.Split(';'))
    {
        // Split on the first '=' only, so values that themselves contain '='
        // (for example base64 padding) are kept instead of being dropped.
        int separator = segment.IndexOf('=');
        if (separator < 0)
        {
            continue;
        }

        // Trim the whitespace that follows each ';' in a "a=1; b=2" string,
        // which would otherwise become part of the cookie name.
        string name = segment.Substring(0, separator).Trim();
        string value = segment.Substring(separator + 1).Trim();

        InternetSetCookie(domain, name, value);
    }
}

  使用演示

   // Proxy to send the requests through; pass null when no proxy is needed.
   // The address is given with an explicit scheme so it parses as a URI.
   WebProxy proxy = WebCrawl.WebRequestHelper.CreatePorxy("http://xx.com", "user", "password");

   // Request the login page and keep the cookies it sets.
   CookieContainer cookie = WebCrawl.WebRequestHelper.GetCookie("http://xxxx.login.com", proxy);

   // POST to the target page with those cookies. Crawl requires the form data
   // as its second argument.
   string html = WebCrawl.WebRequestHelper.Crawl("http://xxx.index.com", "id=1&name=test", proxy, cookie);

   // Request the login page again, this time getting the cookies as a string.
   string cookiestr = WebCrawl.WebRequestHelper.GetCookieString("http://xxxx.login.com", proxy);

   // Hand those cookies to the WebBrowser control.
   WebCrawl.WebRequestHelper.SetCookie(cookiestr, "https://xx.com");

   // Fetch a page that requires login and is generated by JavaScript; ordinary
   // pages work as well.
   string htmlWithJs = WebCrawl.WebRequestHelper.CrawlDynamic("http://xxx.index.com");

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线