Content Scraping Software (A News Scraping Tutorial)
Because the company was short-handed during this period, and the game publishers' special events were fairly frequent, I had no choice but to build a small news-scraping tool so that news from some other sites could be pulled onto our platform as well. I summarize the key points of news scraping as follows: 1. Simulate an HTTP request to fetch the page content. 2. Filter the page content with regular expressions and pull out the parts we want. 3. Assemble the extracted data into the form our platform needs.
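In outline, the three steps look roughly like this. This is only a minimal sketch: it reuses the GetHttpRequest helper and the news list URL shown later in this article, and SaveNews is a hypothetical placeholder for step 3.
// 1. Request the page content
string html = PostRegister.Tools.GetHttpRequest("http://sw.wanmei.com/news/gamenews/list.shtml");
// 2. Filter the page with a regular expression (requires System.Text.RegularExpressions)
MatchCollection links = Regex.Matches(html, @"<a[^>]*href=""(?<href>[^""]*)""[^>]*>(?<text>[\s\S]*?)</a>");
// 3. Assemble the data into what we need (SaveNews is a hypothetical placeholder)
foreach (Match m in links)
    SaveNews(m.Groups["text"].Value, m.Groups["href"].Value);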
Simulating an HTTP request to fetch a page's content is something I won't explain in detail here. If your English is good, you can read the official documentation for HttpWebRequest. Below I paste my helper for simulating HTTP requests.
// Requires: using System; using System.Collections.Generic; using System.IO; using System.Net; using System.Text;
/// <summary>
/// Simulate an HTTP GET request and return the response body as a string.
/// </summary>
/// <param name="url">The URL to request.</param>
/// <returns>The response body, or an empty string when the url is empty.</returns>
public static string GetHttpRequest(string url)
{
    if (string.IsNullOrEmpty(url))
        return string.Empty;
    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    request.Method = "GET";
    request.ServicePoint.Expect100Continue = false;
    string responseValue = string.Empty;
    StreamReader stream = null;
    try
    {
        stream = new StreamReader(request.GetResponse().GetResponseStream());
        responseValue = stream.ReadToEnd();
    }
    finally
    {
        // Close the reader we opened; do not call GetResponse() again here,
        // which would issue a second request (a bug in the original version).
        if (stream != null)
            stream.Close();
    }
    return responseValue;
}
/// <summary>
/// Simulate an HTTP GET request and return the raw response bytes.
/// </summary>
/// <param name="url">The URL to request.</param>
/// <returns>The response bytes, or null when the url is empty.</returns>
public static byte[] GetHttpRequestStream(string url)
{
    if (string.IsNullOrEmpty(url))
        return null;
    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    request.CookieContainer = new CookieContainer();
    request.Method = "GET";
    request.ServicePoint.Expect100Continue = false;
    byte[] bytes = null;
    StreamReader stream = null;
    try
    {
        stream = new StreamReader(request.GetResponse().GetResponseStream());
        // Read the underlying response stream byte by byte until EOF.
        List<byte> lBytes = new List<byte>();
        while (stream.BaseStream.CanRead)
        {
            int result = stream.BaseStream.ReadByte();
            if (result == -1)
                break;
            lBytes.Add((byte)result);
        }
        bytes = lBytes.ToArray();
    }
    finally
    {
        if (stream != null)
            stream.Close();
    }
    return bytes;
}
/// <summary>
/// Simulate an HTTP GET request and decode the response with the given encoding.
/// </summary>
/// <param name="url">The URL to request.</param>
/// <param name="ec">The encoding used to decode the response body.</param>
/// <returns>The response body, or an empty string when the url is empty.</returns>
public static string GetHttpRequest(string url, Encoding ec)
{
    if (string.IsNullOrEmpty(url))
        return string.Empty;
    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    request.Method = "GET";
    request.ServicePoint.Expect100Continue = false;
    string responseValue = string.Empty;
    StreamReader stream = null;
    try
    {
        stream = new StreamReader(request.GetResponse().GetResponseStream(), ec);
        responseValue = stream.ReadToEnd();
    }
    finally
    {
        if (stream != null)
            stream.Close();
    }
    return responseValue;
}
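For illustration only, the three helpers might be called as shown below. The image URL and file name are my own assumptions, not part of the original tool, and the snippet assumes using System.IO and System.Text.
// Fetch the news list page with the default encoding
string listHtml = PostRegister.Tools.GetHttpRequest("http://sw.wanmei.com/news/gamenews/list.shtml");
// Fetch the same page with an explicit encoding, e.g. UTF-8
string listHtmlUtf8 = PostRegister.Tools.GetHttpRequest("http://sw.wanmei.com/news/gamenews/list.shtml", Encoding.UTF8);
// Download raw bytes, e.g. an image referenced by a news article (hypothetical path)
byte[] imageBytes = PostRegister.Tools.GetHttpRequestStream("http://sw.wanmei.com/resources/JPG/example.jpg");
File.WriteAllBytes("news-image.jpg", imageBytes);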
What I scrape is the news on the Perfect World official site; the news list link is http://sw.wanmei.com/news/gamenews/list.shtml
The request code:
// str is the HTML content returned by the request
string str = PostRegister.Tools.GetHttpRequest("http://sw.wanmei.com/news/gamenews/list.shtml");
Filter the HTML content with a regular expression:
string reg = @"<a[^>]*href=(""(?<href>[^""]*)""|\'(?<href>[^\']*)\'|(?<href>[^\s>]*))[^>]*>(?<text>[\s\S]*?)</a>";
MatchCollection mc = Regex.Matches(str, reg);
StringBuilder strTitle = new StringBuilder();
Dictionary<string, string> titleUrlList = new Dictionary<string, string>();
for (int i = 0; i < mc.Count; i++)
{
    string href = mc[i].Groups["href"].Value; // the href attribute value
    string text = mc[i].Groups["text"].Value; // the anchor text, i.e. the content between <a> and </a>
    if (i < mc.Count - 1)
    {
        // "新闻" filters anchors whose text contains the word "news" on this Chinese-language site
        if (text.Contains("新闻") && mc[i + 1].Groups[0].Value.Contains("_blank") && mc[i + 1].Groups[0].Value.Contains("hidefocus") == false)
        {
            strTitle.Append("\'" + mc[i + 1].Groups["text"].Value + "\',"); // quoted, comma-separated titles for the database query
            titleUrlList.Add(mc[i + 1].Groups["text"].Value, "http://sw.wanmei.com" + mc[i + 1].Groups["href"].Value); // news title -> news link
        }
    }
}
DbClassLibrary.Spiders.CommonSpider commonSpider = new CommonSpider();
string allTitle = "";
if (strTitle.ToString() != "")
{
    allTitle = strTitle.ToString().Substring(0, strTitle.ToString().Length - 1); // trim the trailing comma
    List<string> allNotExists = commonSpider.GetNotExistsNews(allTitle, 1); // titles not yet in the database; gameID = 1 means 圣王
    for (int i = 0; i < allNotExists.Count; i++)
    {
        listBoxTtitle.Items.Add(allNotExists[i]);
        listBoxLink.Items.Add(titleUrlList[allNotExists[i]]);
    }
}
lblResult.Text = "Found " + listBoxTtitle.Items.Count + " new items";
if (listBoxTtitle.Items.Count > 0)
    MessageBox.Show("Found new data! " + listBoxTtitle.Items.Count + " new items in total.");
else
{
    MessageBox.Show("Sorry, no new data found on the official site for now.");
}
With the code above, we have filtered out all the news titles and news links that meet our conditions. Note: for the usage and explanation of the regular expression, please see my other article. What follows requests each news link and fetches the main content of the news. The method is also very simple; I paste it below. The key is to extract the content of the element whose id is article_txt:
/// <summary>
/// Get the main body content of a news detail page from its url.
/// </summary>
/// <param name="url">The news detail page URL.</param>
/// <returns>The article body HTML, or an empty string if nothing matches.</returns>
private string GetHtmlContent(string url)
{
    string str = PostRegister.Tools.GetHttpRequest(url);
    // The original regex was mangled when this article was published; this is a
    // reconstruction that targets the <div id="article_txt"> element described above.
    string reg = @"<div id=""article_txt""[^>]*>([\s\S]*?)</div>";
    MatchCollection mc = Regex.Matches(str, reg);
    StringBuilder strTemp = new StringBuilder();
    for (int i = 0; i < mc.Count; i++)
    {
        // Rewrite relative image paths to absolute URLs on the official site.
        strTemp.Append(mc[i].Groups[1].Value
            .Replace("/resources/JPG", "http://sw.wanmei.com/resources/JPG")
            .Replace("/resources/jpg", "http://sw.wanmei.com/resources/jpg"));
        return strTemp.ToString().Trim();
    }
    return "";
}
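To tie the pieces together, here is a sketch of how GetHtmlContent might be applied to each newly discovered link. The SaveNews call is a hypothetical placeholder; the article itself stops at showing the titles and links in the two list boxes.
for (int i = 0; i < listBoxTtitle.Items.Count; i++)
{
    string title = listBoxTtitle.Items[i].ToString();
    string link = listBoxLink.Items[i].ToString();
    string body = GetHtmlContent(link); // main content of the news detail page
    commonSpider.SaveNews(title, link, body, 1); // hypothetical helper to store/publish the item; gameID = 1 (圣王) as above
}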
I hope this helps. ^_^