从网页抓取数据(ViewXpath)

优采云 发布时间: 2021-11-19 19:02

  从网页抓取数据(ViewXpath)

  使用 Xpath 从网页中获取数据

  

  

   /// <summary>
   /// Scrapes product information from the official website
   /// (grandcanyononepoint.com) and stores it in the local database.
   /// </summary>
   /// <remarks>
   /// Only listing entries whose product code yields a non-empty result from
   /// <c>ProductMessage.GetProductList</c> are processed. For each of those the
   /// detail page is downloaded and its description, departure point,
   /// highlights, equipment and images are written through the
   /// <c>ProductMessage</c> persistence helpers. Entries that cannot be parsed
   /// and persistence calls that throw are logged and skipped, so a single bad
   /// record does not abort the whole run.
   /// </remarks>
   /// <returns>
   /// The products that were scraped and matched; never <c>null</c>, and empty
   /// when the listing page contains no product nodes.
   /// </returns>
   public List<ProductMessage> GetlistProductMessage()
   {
       const string siteRoot = "http://www.grandcanyononepoint.com";

       List<ProductMessage> products = new List<ProductMessage>();

       string html = GetProductsDescriptionsImage(siteRoot + "/products");
       if (string.IsNullOrEmpty(html))
       {
           return products;
       }

       HtmlDocument document = new HtmlDocument();
       document.LoadHtml(html);

       // //*[@class='list-product'] selects every element whose class attribute
       // is exactly "list-product". SelectNodes returns null (not an empty
       // collection) when nothing matches, so guard before iterating.
       HtmlNodeCollection productNodes = document.DocumentNode.SelectNodes("//*[@class='list-product']");
       if (productNodes == null)
       {
           return products;
       }

       foreach (HtmlNode node in productNodes)
       {
           // ".//" keeps the query relative to this product node; a bare "//"
           // would search the entire document regardless of the context node.
           HtmlNode codeNode = node.SelectSingleNode(".//*[@style='float:right;']");
           if (codeNode == null)
           {
               System.Diagnostics.Trace.TraceWarning("Product scrape: listing entry without a code node was skipped.");
               continue;
           }

           ProductMessage db_product = new ProductMessage();
           db_product.Code = Formsub(codeNode.InnerText);

           // Only products that GetProductList already knows about are updated.
           var knownProducts = ProductMessage.GetProductList(db_product.Code, "");
           if (knownProducts == null || knownProducts.Count == 0)
           {
               continue;
           }

           HtmlNode nameNode = node.SelectSingleNode(".//*[@style='float:left;']");
           // The product link is the direct child <a>; its href carries the product id.
           string href = ReadAttributeValue(node.SelectSingleNode("a"), "href");
           if (nameNode == null || href == null)
           {
               System.Diagnostics.Trace.TraceWarning("Product scrape: entry " + db_product.Code + " has no name or link and was skipped.");
               continue;
           }

           db_product.Name = Formsub(nameNode.InnerText);
           db_product.ID = GetProductID(href);

           string detailHtml = GetProductsDescriptionsImage(siteRoot + "/products/view/" + db_product.ID);
           if (string.IsNullOrEmpty(detailHtml))
           {
               System.Diagnostics.Trace.TraceWarning("Product scrape: empty detail page for " + db_product.Code + ".");
               continue;
           }

           HtmlDocument detailDocument = new HtmlDocument();
           detailDocument.LoadHtml(detailHtml);
           HtmlNode detailRoot = detailDocument.DocumentNode;

           HtmlNode descriptionNode = detailRoot.SelectSingleNode("//*[@class='product-desc']");
           if (descriptionNode == null)
           {
               System.Diagnostics.Trace.TraceWarning("Product scrape: no description found for " + db_product.Code + ".");
               continue;
           }

           // NOTE(review): single quotes are stripped here and below, presumably
           // because the persistence layer builds SQL by string concatenation -
           // confirm, and prefer parameterized commands there.
           db_product.Descmation = Formsub(descriptionNode.InnerHtml).Replace("'", "");

           HtmlNode departingNode = detailRoot.SelectSingleNode("//*[@class='details-tile']");
           if (departingNode != null)
           {
               db_product.DepartingFrom = Formsub(departingNode.InnerHtml.Replace("Departing From", ""));
           }

           HtmlNode highlightsNode = detailRoot.SelectSingleNode("//*[@class='details-tile details-list']");
           if (highlightsNode != null)
           {
               db_product.ProductHighlights = Formsub(highlightsNode.InnerHtml.Replace("Product Highlights", "")).Replace("'", "");
           }

           try
           {
               ProductMessage.UpdateWEBProductMessage(db_product.Descmation, db_product.DepartingFrom, db_product.ProductHighlights, db_product.Name, db_product.Code);
           }
           catch (System.Exception ex)
           {
               // Best effort: keep scraping, but leave a trace instead of hiding the failure.
               System.Diagnostics.Trace.TraceWarning("Product scrape: updating " + db_product.Code + " failed: " + ex);
           }

           HtmlNode equipmentRoot = detailRoot.SelectSingleNode("//*[@class='product-equip']");
           if (equipmentRoot != null)
           {
               db_product.Equipment = SaveScrapedEquipment(db_product, equipmentRoot);
           }

           SaveScrapedImages(db_product, detailRoot, siteRoot);

           products.Add(db_product);
       }

       return products;
   }

   /// <summary>
   /// Builds an equipment model for every direct child div of the product-equip
   /// node that has a title attribute, persists it, and returns the models.
   /// </summary>
   private static List<EquipmentModel> SaveScrapedEquipment(ProductMessage product, HtmlNode equipmentRoot)
   {
       List<EquipmentModel> equipmentList = new List<EquipmentModel>();

       HtmlNodeCollection equipmentNodes = equipmentRoot.SelectNodes("div");
       if (equipmentNodes == null)
       {
           return equipmentList;
       }

       foreach (HtmlNode equipment in equipmentNodes)
       {
           string title = ReadAttributeValue(equipment, "title");
           if (title == null)
           {
               continue;
           }

           EquipmentModel model = new EquipmentModel();
           model.Name = title;
           model.ImageUrl = "/Papillon/EquipmentImage/" + title + ".png";

           try
           {
               ProductMessage.InsertProductEquipment(product.ID, model.Name, model.ImageUrl);
           }
           catch (System.Exception ex)
           {
               System.Diagnostics.Trace.TraceWarning("Product scrape: saving equipment '" + title + "' failed: " + ex);
           }

           // The model is returned even when the insert failed, as before.
           equipmentList.Add(model);
       }

       return equipmentList;
   }

   /// <summary>
   /// Persists the absolute URL of every full-size image linked from the detail page.
   /// </summary>
   private static void SaveScrapedImages(ProductMessage product, HtmlNode detailRoot, string siteRoot)
   {
       HtmlNodeCollection imageLinks = detailRoot.SelectNodes("//*[@title='See full size image']");
       if (imageLinks == null)
       {
           return;
       }

       foreach (HtmlNode imageLink in imageLinks)
       {
           // The src of the direct child <img> is site-relative; prefix the host
           // to obtain the original image address.
           string source = ReadAttributeValue(imageLink.SelectSingleNode("img"), "src");
           if (source == null)
           {
               continue;
           }

           string imageUrl = siteRoot + source;

           try
           {
               ProductMessage.InsertProductImage(product.ID, imageUrl);
           }
           catch (System.Exception ex)
           {
               System.Diagnostics.Trace.TraceWarning("Product scrape: saving image '" + imageUrl + "' failed: " + ex);
           }
       }
   }

   /// <summary>
   /// Returns the value of the named attribute, or <c>null</c> when the node or
   /// the attribute is missing.
   /// </summary>
   private static string ReadAttributeValue(HtmlNode node, string name)
   {
       if (node == null)
       {
           return null;
       }

       var attribute = node.Attributes[name];
       return attribute == null ? null : attribute.Value;
   }

  查看代码

  XPath 把 HTML 当作类似 XML 的文档来处理,通过节点的标签和属性定位不同的内容,从而从网页中提取想要的数据;这种做法与通用的网络爬虫不同。

  发布 @ 2016-07-29 16:59ly77461 阅读(2286)评论(0)编辑

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线