c#抓取网页数据(实现效果抓取行政区划代码地址:解决思路分析页面数据规则)

优采云 发布时间: 2021-12-19 09:08

  c#抓取网页数据(实现效果抓取行政区划代码地址:解决思路分析页面数据规则)

  达到效果

  抓取行政区划代码

  地址:

  解决方案

  分析页面数据规律,模拟请求获取页面内容,正则过滤,数据存储,递归

  代码示例:创建表、添加、删除

   /// <summary>
   /// Drops the table named <paramref name="tableName"/> in the dbo schema if it
   /// already exists, then creates it again with the columns
   /// id, type, name, pid and url.
   /// </summary>
   /// <param name="tableName">
   /// Unqualified table name. Only letters, digits and underscores are accepted
   /// because the name is spliced directly into the SQL text.
   /// </param>
   /// <exception cref="ArgumentException">
   /// Thrown when <paramref name="tableName"/> is null, empty, or contains a
   /// character other than a letter, digit or underscore.
   /// </exception>
   public void CreateDB(string tableName)
   {
       // Identifiers cannot be passed as SQL parameters, so the only safe way to
       // splice the name into the statement is to whitelist its characters first.
       if (string.IsNullOrEmpty(tableName))
       {
           throw new ArgumentException("Table name must not be null or empty.", "tableName");
       }

       foreach (char c in tableName)
       {
           if (!char.IsLetterOrDigit(c) && c != '_')
           {
               throw new ArgumentException(
                   "Table name may contain only letters, digits and underscores: " + tableName,
                   "tableName");
           }
       }

       // Drop any previous copy of the table so the CREATE below cannot fail on a name clash.
       string dropSql = "IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'dbo." + tableName
           + "') AND type in (N'U')) DROP TABLE dbo." + tableName;

       string createSql = " CREATE TABLE dbo." + tableName + "([id] [varchar](50) NULL,[type] [varchar](50) NULL,[name] [nvarchar](50) NULL,[pid] [varchar](50) NULL,[url] [varchar](100) NULL) ON [PRIMARY]";

       SQLHelper sqlh = new SQLHelper();
       sqlh.ExecuteSQLNonQuery(dropSql);
       sqlh.ExecuteSQLNonQuery(createSql);
   }

/// <summary>
/// Inserts one region row (id, type, name, pid, url) into the given table.
/// </summary>
/// <param name="r">Region whose id, type, name, pid and url fields are stored.</param>
/// <param name="tableName">
/// Target table name. Only letters, digits and underscores are accepted
/// because the name is spliced directly into the SQL text.
/// </param>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="tableName"/> is null, empty, or contains a
/// character other than a letter, digit or underscore.
/// </exception>
public void InsertDB(Region r, string tableName)
{
    // Identifiers cannot be passed as SQL parameters; whitelist the characters instead.
    if (string.IsNullOrEmpty(tableName))
    {
        throw new ArgumentException("Table name must not be null or empty.", "tableName");
    }

    foreach (char c in tableName)
    {
        if (!char.IsLetterOrDigit(c) && c != '_')
        {
            throw new ArgumentException(
                "Table name may contain only letters, digits and underscores: " + tableName,
                "tableName");
        }
    }

    // The values come from scraped HTML, so every one is escaped before being
    // embedded as a string literal. The name column is nvarchar, hence the N
    // prefix so non-ASCII text is not converted through the database code page.
    string strSql = "INSERT INTO " + tableName + " ([id],[type],[name],[pid],[url]) VALUES('"
        + EscapeSqlLiteral(r.id)
        + "','" + EscapeSqlLiteral(r.type)
        + "',N'" + EscapeSqlLiteral(r.name)
        + "','" + EscapeSqlLiteral(r.pid)
        + "','" + EscapeSqlLiteral(r.url) + "')";

    SQLHelper sqlh = new SQLHelper();
    sqlh.ExecuteSQLNonQuery(strSql);
}

/// <summary>
/// Converts a value to text suitable for placing between single quotes in a
/// SQL string literal: null becomes an empty string and every single quote is doubled.
/// </summary>
private static string EscapeSqlLiteral(object value)
{
    string text = value == null ? string.Empty : value.ToString();
    return (text ?? string.Empty).Replace("'", "''");
}

/// <summary>
/// Deletes every row from the [GetRegion] table.
/// </summary>
public void DeleteDB()
{
    // No WHERE clause: the statement clears the whole table.
    const string deleteAllRows = "delete from [GetRegion] ";

    new SQLHelper().ExecuteSQLNonQuery(deleteAllRows);
}

  数据抓取

   /// <summary>
   /// Sends an HTTP request and returns the response body as text.
   /// </summary>
   /// <param name="strUrl">Address of the page to request.</param>
   /// <param name="strType">HTTP method, e.g. POST or GET.</param>
   /// <param name="strEncoding">Name of the encoding used to decode the response, e.g. UTF-8 or gb2312.</param>
   /// <returns>The page content.</returns>
   /// <exception cref="ApplicationException">
   /// Thrown when <paramref name="strUrl"/> does not produce an HTTP request object.
   /// </exception>
   public string Send(string strUrl, string strType, string strEncoding)
   {
       // "as" instead of a hard cast: a non-HTTP scheme yields null here, which makes
       // the check below reachable instead of failing earlier with InvalidCastException.
       HttpWebRequest httpReq = WebRequest.Create(strUrl) as HttpWebRequest;
       if (httpReq == null)
       {
           throw new ApplicationException(string.Format("Invalid url string: {0}", strUrl));
       }

       // Set the request method and a 30-second timeout (Timeout is in milliseconds).
       httpReq.Method = strType;
       httpReq.Timeout = 1000 * 30;

       // The using blocks release the response, stream and reader even when reading
       // throws; an unclosed response keeps its connection occupied.
       using (HttpWebResponse httpRes = (HttpWebResponse)httpReq.GetResponse())
       using (Stream responseStream = httpRes.GetResponseStream())
       using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding(strEncoding)))
       {
           return reader.ReadToEnd();
       }
   }

  递归

<p> public void GetRegion(string strUrl, string strPid, string tableName)

{

string strHtml = Send(strUrl, "GET", "gb2312");

string strTable = AnalyzeHtml(strHtml);

//街道

if (strHtml.Contains("villagetable"))

{

Regex reg = new Regex(@"(?.*?)");

MatchCollection mc = reg.Matches(strTable);

for (int i = 0; i < (mc.Count - 1) / 3; i++)

{

Region r = new Region();

r.id = mc[i * 3 + 1].ToString().Replace("", "").Replace("", "");

r.type = mc[i * 3 + 2].ToString().Replace("", "").Replace("", "");

r.pid = strPid;

r.name = mc[i * 3 + 3].ToString().Replace("", "").Replace("", "");

//listR.Add(r);

InsertDB(r, tableName);

}

}

else

{

Regex reg = new Regex(@"(?is)]*?href=(['""]?)(?[^'""\s>]+)\1[^>]*>(?(?:(?!

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线