Content Collection Software (A News Collection Tutorial You Can't Miss)

优采云 Published: 2021-10-31 15:17


Because the company was short-staffed during this period, and game publishers were running special promotional events fairly often, I had to build a small news-collection tool to pull news from a few other sites onto our platform. I've summarized the key points of news collection: 1. Simulate an HTTP request and fetch the page content. 2. Filter the page content with regular expressions and pull out the parts you want. 3. Assemble the data into the format we need.
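Before diving into the details, here is a minimal, self-contained sketch of those three steps in C#. The URL and the regular expression below are placeholders for illustration only, not the actual target or pattern used later in this article.

using System;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

class PipelineSketch
{
    static void Main()
    {
        // Step 1: simulate an HTTP GET request and read the page HTML.
        // "http://example.com/news/list.html" is a placeholder URL.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://example.com/news/list.html");
        request.Method = "GET";
        string html;
        using (var response = request.GetResponse())
        using (var reader = new StreamReader(response.GetResponseStream()))
        {
            html = reader.ReadToEnd();
        }

        // Step 2: filter the HTML with a regular expression, capturing each <a> tag's link and text.
        string pattern = @"<a[^>]*href=""(?<href>[^""]*)""[^>]*>(?<text>[\s\S]*?)</a>";
        foreach (Match m in Regex.Matches(html, pattern))
        {
            // Step 3: assemble the pieces into whatever structure the platform needs.
            Console.WriteLine(m.Groups["text"].Value + " -> " + m.Groups["href"].Value);
        }
    }
}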

Simulating an HTTP request to fetch page content: I won't explain this in detail here. If your English is good, you can read the official documentation for HttpWebRequest. Below I've posted my helper methods for simulating HTTP requests.

// Required namespaces for the helpers below:
// using System;
// using System.Collections.Generic;
// using System.IO;
// using System.Net;
// using System.Text;

/// <summary>
/// Simulate an HTTP GET request and return the response body as a string.
/// </summary>
/// <param name="url">The URL to request.</param>
/// <returns>The response body, or an empty string if the URL is empty.</returns>
public static string GetHttpRequest(string url)
{
    if (string.IsNullOrEmpty(url))
        return string.Empty;
    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    request.Method = "GET";
    request.ServicePoint.Expect100Continue = false;
    StreamReader stream = null;
    string responseValue = string.Empty;
    try
    {
        stream = new StreamReader(request.GetResponse().GetResponseStream());
        responseValue = stream.ReadToEnd();
    }
    finally
    {
        // Closing the reader also closes the underlying response stream.
        if (stream != null)
        {
            stream.Close();
            stream = null;
        }
    }
    return responseValue;
}

/// <summary>
/// Simulate an HTTP GET request and return the response body as a byte array.
/// </summary>
/// <param name="url">The URL to request.</param>
/// <returns>The raw response bytes, or null if the URL is empty.</returns>
public static byte[] GetHttpRequestStream(string url)
{
    byte[] bytes = null;
    StreamReader stream = null;
    if (string.IsNullOrEmpty(url)) return bytes;
    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    request.CookieContainer = new CookieContainer();
    request.Method = "GET";
    request.ServicePoint.Expect100Continue = false;
    try
    {
        stream = new StreamReader(request.GetResponse().GetResponseStream());
        // Read the response byte by byte until the end of the stream.
        List<byte> lBytes = new List<byte>();
        while (stream.BaseStream.CanRead)
        {
            int result = stream.BaseStream.ReadByte();
            if (result == -1) break;
            lBytes.Add((byte)result);
        }
        bytes = lBytes.ToArray();
    }
    finally
    {
        // Closing the reader also closes the underlying response stream.
        if (stream != null)
        {
            stream.Close();
            stream = null;
        }
    }
    return bytes;
}

/// <summary>
/// Simulate an HTTP GET request and return the response body as a string,
/// decoded with the specified encoding.
/// </summary>
/// <param name="url">The URL to request.</param>
/// <param name="ec">The text encoding used to decode the response.</param>
/// <returns>The response body, or an empty string if the URL is empty.</returns>
public static string GetHttpRequest(string url, Encoding ec)
{
    if (string.IsNullOrEmpty(url)) return string.Empty;
    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    request.Method = "GET";
    request.ServicePoint.Expect100Continue = false;
    StreamReader stream = null;
    string responseValue = string.Empty;
    try
    {
        stream = new StreamReader(request.GetResponse().GetResponseStream(), ec);
        responseValue = stream.ReadToEnd();
    }
    finally
    {
        // Closing the reader also closes the underlying response stream.
        if (stream != null)
        {
            stream.Close();
            stream = null;
        }
    }
    return responseValue;
}
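A quick usage sketch of the helpers above. The encoding overload is handy when the target page is not UTF-8; the GB2312 encoding and the image path below are assumptions for illustration only. PostRegister.Tools is simply the class that hosts these helpers, as used later in this article.

// Fetch the news list page with the default encoding.
string html = PostRegister.Tools.GetHttpRequest("http://sw.wanmei.com/news/gamenews/list.shtml");

// If the page were served as GB2312 (an assumption for illustration), decode it explicitly.
string htmlGb = PostRegister.Tools.GetHttpRequest(
    "http://sw.wanmei.com/news/gamenews/list.shtml",
    Encoding.GetEncoding("gb2312"));

// Fetch raw bytes, e.g. to download an image referenced by a news article (hypothetical path).
byte[] imageBytes = PostRegister.Tools.GetHttpRequestStream("http://sw.wanmei.com/resources/jpg/sample.jpg");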

What I'm collecting is news from the Perfect World (完美) official site. The news list URL is http://sw.wanmei.com/news/gamenews/list.shtml.

The request code:

// str holds the HTML content returned by the request
string str = PostRegister.Tools.GetHttpRequest("http://sw.wanmei.com/news/gamenews/list.shtml");

Filter the HTML content with a regular expression:

  string reg = @"]*href=(""(?[^""]*)""|\'(?[^\']*)\'|(?[^\s>]*))[^>]*>(?[\s\S]*?)</a>";

MatchCollection mc = Regex.Matches(str, reg);

StringBuilder strTitle = new StringBuilder();

Dictionary titleUrlList = new Dictionary();

strTitle.Append("");

for (int i = 0; i < mc.Count; i++)

{

string href = mc[i].Groups["href"].Value;// 这是href内容

string text = mc[i].Groups["text"].Value;// 这是text内容,就是<a>这里的内容</a>

if (i < mc.Count - 1)

{

if (text.Contains("新闻") && mc[i + 1].Groups[0].Value.Contains("_blank") && mc[i + 1].Groups[0].Value.Contains("hidefocus") == false)

{

strTitle.Append("\'" + mc[i + 1].Groups["text"].Value + "\',");//拼接字符串,用于查询数据库使用

titleUrlList.Add(mc[i + 1].Groups["text"].Value, "http://sw.wanmei.com" + mc[i + 1].Groups["href"].Value);//新闻标题和新闻链接

}

}

}

DbClassLibrary.Spiders.CommonSpider commonSpider = new CommonSpider();

string allTitle = "";

if (strTitle.ToString() != "")

{

allTitle = strTitle.ToString().Substring(0, strTitle.ToString().Length - 1);//截掉拼接的字符串中的最后一个,号

List allNotExists = commonSpider.GetNotExistsNews(allTitle, 1);//查询数据库中不存在的新闻,gameID=1表示 圣王

for (int i = 0; i < allNotExists.Count; i++)

{

listBoxTtitle.Items.Add(allNotExists[i]); listBoxLink.Items.Add(titleUrlList[allNotExists[i]]);

}

}

lblResult.Text = "共发现" + listBoxTtitle.Items.Count + "条新数据";

if (listBoxTtitle.Items.Count > 0)

MessageBox.Show("啦啦啦,发现新数据!共发现" + listBoxTtitle.Items.Count + "条新数据");

else

{

MessageBox.Show("对不起,暂时没有发现官网有新数据");

}
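If you're unsure what the pattern above actually captures, the short, self-contained sketch below runs it against a made-up HTML snippet (the links and titles are invented for illustration) and prints each anchor's "text" and "href" named groups.

using System;
using System.Text.RegularExpressions;

class RegexDemo
{
    static void Main()
    {
        // Invented sample HTML, roughly shaped like a news list.
        string sample = "<a href=\"/news/detail_1.shtml\" target=\"_blank\">游戏新闻一</a>"
                      + "<a href='/news/detail_2.shtml' hidefocus=\"true\">游戏新闻二</a>";
        string reg = @"<a[^>]*href=(""(?<href>[^""]*)""|'(?<href>[^']*)'|(?<href>[^\s>]*))[^>]*>(?<text>[\s\S]*?)</a>";
        foreach (Match m in Regex.Matches(sample, reg))
        {
            // Named groups: "href" is the link target, "text" is the anchor's inner text.
            Console.WriteLine(m.Groups["text"].Value + " => " + m.Groups["href"].Value);
        }
        // Expected output:
        // 游戏新闻一 => /news/detail_1.shtml
        // 游戏新闻二 => /news/detail_2.shtml
    }
}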

With the code above, we've extracted all the news titles and news links that match our conditions. Note: for the usage and an explanation of the regular expression, please refer to my other article. Next comes requesting each news link and fetching the main content of the news page. The method is also simple, so I'm posting it here; it mainly extracts the content of the element whose id is article_txt:

/// <summary>
/// Fetch the main body of a news article given its detail-page URL.
/// </summary>
/// <param name="url">The URL of the news detail page.</param>
/// <returns>The inner HTML of the article_txt element, or an empty string if nothing matched.</returns>
private string GetHtmlContent(string url)
{
    string str = PostRegister.Tools.GetHttpRequest(url);
    // Match the <div id="article_txt"> ... </div> block that holds the article body.
    string reg = @"<div id=""article_txt"">((?!</div>)[\s\S]*?)</div>";
    MatchCollection mc = Regex.Matches(str, reg);
    StringBuilder strTemp = new StringBuilder();
    for (int i = 0; i < mc.Count; i++)
    {
        // Turn relative image paths into absolute URLs.
        strTemp.Append(mc[i].Groups[0].Value
            .Replace("/resources/JPG", "http://sw.wanmei.com/resources/JPG")
            .Replace("/resources/jpg", "http://sw.wanmei.com/resources/jpg"));
        // Strip the trailing </div> (6 characters) and the opening div tag, then return the first match.
        return strTemp.ToString()
            .Substring(0, strTemp.ToString().Length - 6)
            .Replace(@"<div id=""article_txt"">", "")
            .Trim();
    }
    return "";
}
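To tie the pieces together, here is a rough sketch of how the detail pages could be walked after the list page has been filtered. The loop reuses listBoxTtitle and listBoxLink from the code above; persisting the result is only hinted at, since that part isn't shown in this article, and SaveNews is a hypothetical method name.

// For every newly discovered news item, fetch its detail page and grab the article body.
for (int i = 0; i < listBoxTtitle.Items.Count; i++)
{
    string title = listBoxTtitle.Items[i].ToString();
    string link = listBoxLink.Items[i].ToString();
    string body = GetHtmlContent(link); // inner HTML of the article_txt div
    // Writing the item to the database is up to your own data layer; a hypothetical call might be:
    // commonSpider.SaveNews(title, link, body, 1); // gameID = 1, as in the query above
}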

Hope this helps you. ^_^
