java爬虫抓取动态网页(动态cookie页面应该怎么设置才能抓取数据?)
优采云 发布时间: 2021-10-02 15:32
java爬虫抓取动态网页(动态cookie页面应该怎么设置才能抓取数据?)
我需要抓取某个包含数据的网页的内容,目前使用 HttpURLConnection 进行抓取(见代码段1)。但是,目标网页似乎通过 JS 在浏览器端设置了动态 cookie,导致实际抓取到的内容变成了代码段2。请问对于这种设置了动态 cookie 的页面,应该如何处理才能抓取到数据?如需具体的链接地址,请私信我~
<br />
/**<br />
* 代码段1<br />
*/<br />
import java.io.BufferedReader;<br />
import java.io.IOException;<br />
import java.io.InputStream;<br />
import java.io.InputStreamReader;<br />
import java.net.CookieHandler;<br />
import java.net.CookieManager;<br />
import java.net.CookiePolicy;<br />
import java.net.HttpURLConnection;<br />
import java.net.URL;<br />
/**
 * Small helper that downloads a page as text through {@link HttpURLConnection}.
 *
 * <p>Cookies sent by the server in {@code Set-Cookie} headers are kept in a
 * JVM-wide {@link CookieManager}. This class does not execute JavaScript, so a
 * cookie that the page sets from a script is NOT picked up automatically.
 */
public class GetWebContent {

    /**
     * User-Agent strings; one of them is picked at random for every request.
     * Kept public and non-final so existing callers that read or replace the
     * array keep working.
     */
    public static String[] UserAgent = {
        // Disabled entry kept from the original snippet:
        // "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.2",
        "Mozilla/5.0 (iPad; U; CPU OS 9_2_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B500 Safari/531.21.11",

        // http://blog.csdn.net/yjflinchong
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36", // Windows 10, Chrome
        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1" // iPhone X
    };

    /**
     * Fetches {@code urlString} with a GET request and returns the body as text.
     *
     * <p>Note: this is a static method that happens to share the class name; it
     * is not a constructor.
     *
     * @param urlString address to fetch; {@code http://} is prepended when no
     *                  {@code http://} or {@code https://} scheme is present
     * @param charset   name of the charset used to decode the response body
     * @param timeout   connect and read timeout in milliseconds
     * @return the response body with every line terminated by {@code \r\n};
     *         {@code null} when {@code urlString} is null or empty, when the
     *         status code is not 200, or when obtaining the status code fails
     * @throws IOException if the URL is malformed, the connection cannot be
     *                     opened, or reading the body fails
     */
    public static String GetWebContent(String urlString, final String charset, int timeout) throws IOException {
        if (urlString == null || urlString.length() == 0) {
            return null;
        }
        if (!(urlString.startsWith("http://") || urlString.startsWith("https://"))) {
            urlString = "http://" + urlString;
        }

        installCookieManagerOnce();

        URL url = new URL(urlString);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            // No request body is written, so doOutput is deliberately left at its default.
            conn.setRequestProperty("Pragma", "no-cache");
            conn.setRequestProperty("Cache-Control", "no-cache");
            // http://blog.csdn.net/yjflinchong
            // Index is derived from the array length so it stays valid if entries are added or removed.
            int temp = (int) (Math.random() * UserAgent.length);
            conn.setRequestProperty("User-Agent", UserAgent[temp]); // impersonate a browser / mobile device
            System.out.println(UserAgent[temp]);
            // Prefer HTML/XML; the trailing wildcard entry still accepts any other type (images, PDF, ...).
            conn.setRequestProperty("Accept",
                    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            conn.setConnectTimeout(timeout);
            // Without a read timeout a stalled server would block this call indefinitely.
            conn.setReadTimeout(timeout);

            try {
                if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
                    return null;
                }
            } catch (IOException | RuntimeException e) {
                // Best effort, as before: report the failure and signal "no content" with null.
                e.printStackTrace();
                return null;
            }

            StringBuilder sb = new StringBuilder();
            // try-with-resources closes the reader (and the underlying stream) even if reading fails.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), charset))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line).append("\r\n");
                }
            }
            return sb.toString();
        } finally {
            // Runs on every exit path, including the non-200 and exception paths.
            conn.disconnect();
        }
    }

    /**
     * Installs an accept-all {@link CookieManager} as the JVM default only when
     * no default handler exists yet. Replacing the handler on every request
     * would throw away all cookies collected by earlier requests.
     */
    private static synchronized void installCookieManagerOnce() {
        if (CookieHandler.getDefault() == null) {
            CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
        }
    }
}
代码段2:使用代码段1实际抓取到的网页内容
<br />
<br />
<br />
<br />
<br />
<br />
// Values read by jump(): 'cookie' is the value stored in the cookie,<br />
// 'uri' is the address the page navigates to afterwards.<br />
var data = {<br />
'cookie' : "6a947161cfdda35d01f8c6c673653ffc",<br />
'uri' : "此处地址即为需要抓取的地址,如有需要请私信我"<br />
};<br />
<br />
// Returns the unescape()-decoded value of the cookie named c_name read from<br />
// document.cookie, or "" when document.cookie is empty or the name is not found.<br />
// Note: c_start and c_end are assigned without a declaration, so they are not<br />
// local variables of this function.<br />
// Note: the lookup is a plain substring search for c_name + "=", so it can also<br />
// match the tail of a longer cookie name.<br />
function getCookie(c_name){<br />
if (document.cookie.length>0){ <br />
c_start=document.cookie.indexOf(c_name + "=")<br />
if (c_start!=-1){ <br />
// Skip past "name=" so c_start points at the first character of the value.<br />
c_start=c_start + c_name.length+1 <br />
// The value runs up to the next ";" or, if there is none, to the end of the string.<br />
c_end=document.cookie.indexOf(";",c_start)<br />
if (c_end==-1) c_end=document.cookie.length<br />
return unescape(document.cookie.substring(c_start,c_end))<br />
} <br />
}<br />
return "";<br />
}<br />
<br />
// Writes the cookie c_name with the escape()-encoded value to document.cookie.<br />
// When expiredays is not null, "path=/" and an expiry date expiredays days from<br />
// now are appended; when it is null, neither the path nor the expiry is written.<br />
function setCookie(c_name,value,expiredays){<br />
var exdate=new Date();<br />
exdate.setDate(exdate.getDate()+expiredays);<br />
document.cookie=c_name+ "=" +escape(value)+((expiredays==null) ? "" : "; path=/; expires="+exdate.toGMTString());<br />
}<br />
<br />
// Stores data['cookie'] in the cookie named 'elvaeye' with a 365-day expiry,<br />
// then navigates the window to data['uri'].<br />
function jump(){<br />
setCookie('elvaeye',data['cookie'],365);<br />
window.location = data['uri'];<br />
}<br />
<br />
<br />
<br />
<br />
<br />