c#抓取网页数据(实现效果抓取行政区划代码地址:解决思路分析页面数据规则)
优采云 发布时间: 2021-12-19 09:08c#抓取网页数据(实现效果抓取行政区划代码地址:解决思路分析页面数据规则)
达到效果
抓取行政区划代码
地址:
解决方案
分析页面数据规则,模拟请求获取页面内容,正则过滤,数据存储,递归
代码示例:创建表、添加、删除
/// <summary>
/// Drops the table dbo.{tableName} when it already exists and then creates it again
/// with the columns id, type, name, pid and url.
/// </summary>
/// <param name="tableName">Table name without schema; it is concatenated into the SQL text as-is.</param>
public void CreateDB(string tableName)
{
    SQLHelper helper = new SQLHelper();

    // Step 1: remove a previous copy of the table, if any.
    string dropIfPresent = string.Concat(
        "IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'dbo.",
        tableName,
        "') AND type in (N'U')) DROP TABLE dbo.",
        tableName);

    // Step 2: create the empty table.
    string createTable = string.Concat(
        " CREATE TABLE dbo.",
        tableName,
        "([id] [varchar](50) NULL,[type] [varchar](50) NULL,[name] [nvarchar](50) NULL,[pid] [varchar](50) NULL,[url] [varchar](100) NULL) ON [PRIMARY]");

    helper.ExecuteSQLNonQuery(dropIfPresent);
    helper.ExecuteSQLNonQuery(createTable);
}
/// <summary>
/// Inserts one region as a row (id, type, name, pid, url) into the given table.
/// </summary>
/// <param name="r">Region whose id, type, name, pid and url fields are written.</param>
/// <param name="tableName">Target table name; it is concatenated into the SQL text as-is.</param>
public void InsertDB(Region r, string tableName)
{
    // The field values originate from scraped pages, so every value is escaped before it is
    // embedded in a string literal. The name literal carries the N prefix because the
    // [name] column is nvarchar and holds non-ASCII text.
    string strSql = "INSERT INTO " + tableName + " ([id],[type],[name],[pid],[url]) VALUES('" + EscapeSqlLiteral(r.id)
        + "','" + EscapeSqlLiteral(r.type) + "',N'" + EscapeSqlLiteral(r.name) + "','" + EscapeSqlLiteral(r.pid)
        + "','" + EscapeSqlLiteral(r.url) + "')";
    SQLHelper sqlh = new SQLHelper();
    sqlh.ExecuteSQLNonQuery(strSql);
}

/// <summary>
/// Converts a value to text that is safe inside a single-quoted SQL string literal
/// by doubling every apostrophe; null becomes the empty string.
/// </summary>
private static string EscapeSqlLiteral(object value)
{
    if (value == null)
    {
        return string.Empty;
    }
    return value.ToString().Replace("'", "''");
}
/// <summary>
/// Deletes every row from the [GetRegion] table.
/// </summary>
public void DeleteDB()
{
    const string deleteAllRows = "delete from [GetRegion] ";
    new SQLHelper().ExecuteSQLNonQuery(deleteAllRows);
}
数据抓取
/// <summary>
/// Sends an HTTP request and returns the response body as text.
/// </summary>
/// <param name="strUrl">Page URL.</param>
/// <param name="strType">HTTP method, e.g. POST or GET.</param>
/// <param name="strEncoding">Name of the encoding used to decode the response, e.g. UTF-8 or gb2312.</param>
/// <returns>The page content.</returns>
/// <exception cref="ApplicationException">The URL does not produce an HTTP request.</exception>
public string Send(string strUrl, string strType, string strEncoding)
{
    // "as" instead of a hard cast so that a non-HTTP URL reaches the check below
    // rather than failing with InvalidCastException.
    HttpWebRequest httpReq = WebRequest.Create(strUrl) as HttpWebRequest;
    if (httpReq == null)
    {
        throw new ApplicationException(string.Format("Invalid url string: {0}", strUrl));
    }

    httpReq.Method = strType;
    httpReq.Timeout = 1000 * 30; // 30 seconds, expressed in milliseconds

    // Dispose response, stream and reader even when reading or decoding throws;
    // an unclosed response keeps its connection occupied.
    using (HttpWebResponse httpRes = (HttpWebResponse)httpReq.GetResponse())
    using (Stream responseStream = httpRes.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding(strEncoding)))
    {
        return reader.ReadToEnd();
    }
}
递归
<p> public void GetRegion(string strUrl, string strPid, string tableName)
{
string strHtml = Send(strUrl, "GET", "gb2312");
string strTable = AnalyzeHtml(strHtml);
//街道
if (strHtml.Contains("villagetable"))
{
Regex reg = new Regex(@"(?.*?)");
MatchCollection mc = reg.Matches(strTable);
for (int i = 0; i < (mc.Count - 1) / 3; i++)
{
Region r = new Region();
r.id = mc[i * 3 + 1].ToString().Replace("", "").Replace("", "");
r.type = mc[i * 3 + 2].ToString().Replace("", "").Replace("", "");
r.pid = strPid;
r.name = mc[i * 3 + 3].ToString().Replace("", "").Replace("", "");
//listR.Add(r);
InsertDB(r, tableName);
}
}
else
{
Regex reg = new Regex(@"(?is)]*?href=(['""]?)(?[^'""\s>]+)\1[^>]*>(?(?:(?!