从网页抓取数据(ViewXpath)
优采云 发布时间: 2021-11-19 19:02
从网页抓取数据(ViewXpath)
使用 Xpath 从网页中获取数据
/// <summary>
/// Scrapes product information from the official website and stores it in the local database.
/// </summary>
/// <remarks>
/// Downloads the product listing page, then for every element with class="list-product"
/// whose product code is already known to <c>ProductMessage.GetProductList</c> it downloads
/// the product's detail page, extracts description, departure location, highlights,
/// equipment and images, and writes them through the <c>ProductMessage</c> data-access methods.
/// Tiles or detail pages that lack an expected node are skipped with a trace warning instead
/// of aborting the whole run; database failures are traced and the scrape continues.
/// </remarks>
/// <returns>
/// The products that were found on the site and processed; an empty list when the listing
/// page could not be downloaded or contains no product tiles.
/// </returns>
public List<ProductMessage> GetlistProductMessage()
{
    List<ProductMessage> products = new List<ProductMessage>();

    string html = GetProductsDescriptionsImage("http://www.grandcanyononepoint.com/products");
    if (string.IsNullOrEmpty(html))
    {
        TraceScrapeWarning("product listing page returned no content.");
        return products;
    }

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(html);

    // XPath //*[@class='list-product'] matches every node whose class attribute is exactly
    // "list-product". SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection productNodes = document.DocumentNode.SelectNodes("//*[@class='list-product']");
    if (productNodes == null)
    {
        return products;
    }

    foreach (HtmlNode node in productNodes)
    {
        // Re-parse the tile on its own so the absolute "//" XPaths below only see this tile.
        HtmlDocument tileDocument = new HtmlDocument();
        tileDocument.LoadHtml(node.InnerHtml);
        HtmlNode tileRoot = tileDocument.DocumentNode;

        HtmlNode codeNode = tileRoot.SelectSingleNode("//*[@style='float:right;']");
        if (codeNode == null)
        {
            TraceScrapeWarning("product tile without a code element was skipped.");
            continue;
        }

        ProductMessage db_product = new ProductMessage();
        db_product.Code = Formsub(codeNode.InnerText);

        // Only products whose code is already present locally are refreshed.
        var knownProducts = ProductMessage.GetProductList(db_product.Code, "");
        if (knownProducts == null || knownProducts.Count <= 0)
        {
            continue;
        }

        HtmlNode nameNode = tileRoot.SelectSingleNode("//*[@style='float:left;']");
        HtmlNode linkNode = tileRoot.SelectSingleNode("a");
        if (nameNode == null || linkNode == null || linkNode.Attributes["href"] == null)
        {
            TraceScrapeWarning("product tile '" + db_product.Code + "' has no name or link; skipped.");
            continue;
        }

        db_product.Name = Formsub(nameNode.InnerText);
        // The product id is derived from the href attribute of the tile's <a> node.
        db_product.ID = GetProductID(linkNode.Attributes["href"].Value);

        string detailHtml = GetProductsDescriptionsImage("http://www.grandcanyononepoint.com/products/view/" + db_product.ID);
        if (string.IsNullOrEmpty(detailHtml))
        {
            TraceScrapeWarning("detail page for product '" + db_product.Code + "' returned no content; skipped.");
            continue;
        }

        HtmlDocument detailDocument = new HtmlDocument();
        detailDocument.LoadHtml(detailHtml);
        HtmlNode detailRoot = detailDocument.DocumentNode;

        HtmlNode descriptionNode = detailRoot.SelectSingleNode("//*[@class='product-desc']");
        if (descriptionNode == null)
        {
            TraceScrapeWarning("detail page for product '" + db_product.Code + "' has no description; skipped.");
            continue;
        }

        // Single quotes are stripped from the stored text (presumably to keep it safe for the
        // SQL built by the data layer -- TODO confirm and switch to parameterized queries).
        db_product.Descmation = Formsub(descriptionNode.InnerHtml).Replace("'", "");

        HtmlNode departingNode = detailRoot.SelectSingleNode("//*[@class='details-tile']");
        if (departingNode != null)
        {
            db_product.DepartingFrom = Formsub(departingNode.InnerHtml.Replace("Departing From", ""));
        }

        HtmlNode highlightsNode = detailRoot.SelectSingleNode("//*[@class='details-tile details-list']");
        if (highlightsNode != null)
        {
            db_product.ProductHighlights = Formsub(highlightsNode.InnerHtml.Replace("Product Highlights", "")).Replace("'", "");
        }

        try
        {
            ProductMessage.UpdateWEBProductMessage(db_product.Descmation, db_product.DepartingFrom, db_product.ProductHighlights, db_product.Name, db_product.Code);
        }
        catch (System.Exception ex)
        {
            // Best effort: one failed update must not stop the remaining products.
            TraceScrapeWarning("updating product '" + db_product.Code + "' failed: " + ex);
        }

        ScrapeProductEquipment(detailRoot, db_product);
        ScrapeProductImages(detailRoot, db_product);

        products.Add(db_product);
    }

    return products;
}

/// <summary>
/// Reads the equipment icons (direct div children of the class="product-equip" element) from a
/// product detail page, stores each one in the database and assigns the list to
/// <paramref name="product"/>. Does nothing when the page has no equipment section.
/// </summary>
private static void ScrapeProductEquipment(HtmlNode detailRoot, ProductMessage product)
{
    HtmlNode equipmentContainer = detailRoot.SelectSingleNode("//*[@class='product-equip']");
    if (equipmentContainer == null)
    {
        return;
    }

    HtmlDocument equipmentDocument = new HtmlDocument();
    equipmentDocument.LoadHtml(equipmentContainer.InnerHtml);
    HtmlNodeCollection equipmentNodes = equipmentDocument.DocumentNode.SelectNodes("div");

    List<EquipmentModel> equipmentList = new List<EquipmentModel>();
    if (equipmentNodes != null)
    {
        foreach (HtmlNode equipmentNode in equipmentNodes)
        {
            // The equipment name comes from the title attribute; a div without one carries no data.
            if (equipmentNode.Attributes["title"] == null)
            {
                continue;
            }

            string title = equipmentNode.Attributes["title"].Value;
            EquipmentModel equipment = new EquipmentModel();
            equipment.Name = title;
            equipment.ImageUrl = "/Papillon/EquipmentImage/" + title + ".png";

            try
            {
                ProductMessage.InsertProductEquipment(product.ID, equipment.Name, equipment.ImageUrl);
            }
            catch (System.Exception ex)
            {
                // Best effort: keep the item in the returned list even if the insert failed.
                TraceScrapeWarning("inserting equipment '" + equipment.Name + "' failed: " + ex);
            }

            equipmentList.Add(equipment);
        }
    }

    product.Equipment = equipmentList;
}

/// <summary>
/// Stores the absolute URL of every image found under a title="See full size image" element
/// of a product detail page in the database for <paramref name="product"/>.
/// </summary>
private static void ScrapeProductImages(HtmlNode detailRoot, ProductMessage product)
{
    HtmlNodeCollection imageLinkNodes = detailRoot.SelectNodes("//*[@title='See full size image']");
    if (imageLinkNodes == null)
    {
        return;
    }

    foreach (HtmlNode imageLinkNode in imageLinkNodes)
    {
        HtmlDocument imageDocument = new HtmlDocument();
        imageDocument.LoadHtml(imageLinkNode.InnerHtml);
        HtmlNode imgNode = imageDocument.DocumentNode.SelectSingleNode("img");
        if (imgNode == null || imgNode.Attributes["src"] == null)
        {
            continue;
        }

        // The page uses site-relative src values; prefix the host to get the original image URL.
        string imageUrl = "http://www.grandcanyononepoint.com" + imgNode.Attributes["src"].Value;

        try
        {
            ProductMessage.InsertProductImage(product.ID, imageUrl);
        }
        catch (System.Exception ex)
        {
            // Best effort: one failed insert must not stop the remaining images.
            TraceScrapeWarning("inserting image '" + imageUrl + "' failed: " + ex);
        }
    }
}

/// <summary>
/// Writes a warning to the configured trace listeners so skipped nodes and failed database
/// calls are visible instead of being silently swallowed.
/// </summary>
private static void TraceScrapeWarning(string message)
{
    System.Diagnostics.Trace.TraceWarning("GetlistProductMessage: " + message);
}
查看代码
XPath 将 HTML 视为类似 XML 的节点树,通过节点的标签和属性定位并提取不同的内容,从而可以从网页中获取想要的数据;这与通用的网络爬虫不同。
发布 @ 2016-07-29 16:59 ly77461 阅读(2286) 评论(0) 编辑