.NET 中的网页数据抓取(提取 HTML 中的数据、提取 table 中的数据)

  • Post author:
  • Post category:其他


方法一:

// Method 1: download the page at http://www.cftea.com/, decode the response
// body as GB2312, and print the result of StripHTML(contents) to the console.
// StripHTML is a helper that is not shown in this snippet.
//
// NOTE(review): on .NET Core / .NET 5+ the "gb2312" code page is only available
// after registering CodePagesEncodingProvider; on .NET Framework it is built in.
WebRequest request = WebRequest.Create("http://www.cftea.com/");

// The using blocks release the response, its stream and the reader even when
// reading or StripHTML throws, replacing the manual Close()/Dispose() calls.
using (WebResponse response = request.GetResponse())
using (Stream responseStream = response.GetResponseStream())
using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("gb2312")))
{
    var contents = reader.ReadToEnd();
    Console.WriteLine(StripHTML(contents));
}

// Block on console input so the console window stays open.
Console.Read();

方法二:(抓取html中table里面的数据)

// Method 2: extract the inner text of every <table> element with a regular
// expression. The sample document holds five tables containing a, b, c, d, e.
string html = @"<html>
<head>
<title></title>
</head>
<body>
<table>a</table>
<table>b</table>
<table>c</table>
<table>d</table>
<table>e</table>
</body>
</html>
";

// (?is)           case-insensitive, and '.' also matches newlines
// (?<=<table>)    look-behind: the match starts right after an opening tag
// .+?             lazy, so each match stops at the nearest closing tag
// (?=</table>)    look-ahead: the closing tag itself is not part of the match
// Limitation: only a bare "<table>" opening tag (no attributes) is recognised.
var strReg = @"(?is)(?<=<table>).+?(?=</table>)";

// Collected matches, in document order.
List<string> result = new List<string>();

MatchCollection mc = Regex.Matches(html, strReg);

foreach (Match m in mc)
{
    result.Add(m.Value);
    Console.WriteLine(m.Value);
}

方法三:

// Method 3: parse the HTML held in orgStr (obtained elsewhere, not shown in this
// snippet) with HtmlAgilityPack and print the text of the second <td> of every
// table row.
var htmlDocument = new HtmlDocument();

htmlDocument.LoadHtml(orgStr);

// XPath query for every table in the document.
// SelectNodes returns null (not an empty collection) when nothing matches,
// so each result is checked before it is enumerated.
var tables = htmlDocument.DocumentNode.SelectNodes("//table");

if (tables != null)
{
    foreach (var table in tables)
    {
        // ".//tr" is evaluated relative to the current table. A leading "//"
        // would search the whole document again for every table.
        var rows = table.SelectNodes(".//tr");

        if (rows == null)
        {
            continue;
        }

        foreach (var tr in rows)
        {
            // "td[2]" is the second <td> child of this row (XPath positions are
            // 1-based); rows with fewer than two cells are skipped instead of
            // throwing a NullReferenceException.
            var secondCell = tr.SelectSingleNode("td[2]");

            if (secondCell == null)
            {
                continue;
            }

            var collegeName = secondCell.InnerText;

            Console.WriteLine(collegeName);
        }
    }
}

相关的网址:


http://www.cnblogs.com/chuncn/archive/2009/09/07/1561564.html


http://www.cnblogs.com/xinzhyu/archive/2008/12/09/1351434.html


http://msdn.microsoft.com/zh-tw/ee787055.aspx

方法四:(对网页中table里面的数据提取)

#region http://www.gaokao.com/e/20120109/4f0a8e1773aa0.shtml

// Method 4, first variant. Everything in this region is disabled (commented-out) code.
// When enabled it would: fetch one of the URLs below as gb2312 text through
// ChinaEduSp.Crawl.HttpUtility.GetContentByUrl, load it into an HtmlDocument, select
// every node matching //table//tr, read cells td[1]..td[5] of each row as position,
// school, province, type and total score, and add one RankingDescription per row
// through db, calling db.SaveChanges() after each row.
// db, RankingDescription and ChinaEduSp.Crawl.HttpUtility are not defined in this snippet.
// The "_2" .. "_6" URLs are presumably further pages of the same listing - TODO confirm.
// NOTE(review): seq is re-read from db.RankingDescriptions.Count() at the top of every
// iteration, so the trailing seq++ has no effect.
// NOTE(review): the catch block only copies ex.Message into a local, so any failure
// (for example Convert.ToInt32 on a non-numeric cell) is silently ignored.
// NOTE(review): "/r/n" in the commented-out Response.Write looks like a typo for "\r\n".
// NOTE(review): the string literals use typographic quotes and must be replaced with
// plain double quotes before this code can be re-enabled.

//var url = “http://www.gaokao.com/e/20120109/4f0a8e1773aa0.shtml”;

//var url = “http://www.gaokao.com/e/20120109/4f0a8e1773aa0_2.shtml”;

//var url = “http://www.gaokao.com/e/20120109/4f0a8e1773aa0_3.shtml”;

//var url = “http://www.gaokao.com/e/20120109/4f0a8e1773aa0_4.shtml”;

//var url = “http://www.gaokao.com/e/20120109/4f0a8e1773aa0_5.shtml”;

//var url = “http://www.gaokao.com/e/20120109/4f0a8e1773aa0_6.shtml”;

//var orgStr = ChinaEduSp.Crawl.HttpUtility.GetContentByUrl(url, “gb2312”);

//var htmlDocument = new HtmlDocument();

//htmlDocument.LoadHtml(orgStr);

//var rows = htmlDocument.DocumentNode.SelectNodes(“//table//tr”);

//foreach (var item in rows)

//{


//    var pos = item.SelectSingleNode(“td[1]”).InnerText;

//    var school = item.SelectSingleNode(“td[2]”).InnerText;

//    var province = item.SelectSingleNode(“td[3]”).InnerText;

//    var type = item.SelectSingleNode(“td[4]”).InnerText;

//    var totalScore = item.SelectSingleNode(“td[5]”).InnerText;

//    var seq = db.RankingDescriptions.Count();

//    //Response.Write(“名次:” + pos + ” 学校名称:” + school + ” 所在省份:” + province + ” 类型:” + type + ” 总分:” + totalScore + “/r/n”);

//    //Response.Write(“名次:” + pos + ” 学校名称:” + school);

//    try

//    {


//        db.RankingDescriptions.Add(new RankingDescription

//        {


//            POS = Convert.ToInt32(pos),

//            SchoolName = school,

//            Province = province,

//            Area = province,

//            Type = type,

//            TotalScore = totalScore,

//            IsShow = true,

//            IsDelete = false,

//            RankId = 0,

//            Seq = seq

//        });

//        db.SaveChanges();

//        seq++;

//    }

//    catch (Exception ex)

//    {


//        string msg = ex.Message;

//    }

//}

#endregion

#region http://www.gaokao.com/e/20120109/4f0a914934baa_2.shtml

// Method 4, second variant. Everything in this region is disabled (commented-out) code.
// When enabled it would: fetch the URL below as gb2312 text through
// ChinaEduSp.Crawl.HttpUtility.GetContentByUrl, load it into an HtmlDocument, select
// the rows of a table nested inside another table's cell (//table//tr//td//table//tr),
// read cells td[1]..td[4] of each row as position, school, province and total score,
// and add one RankingDescription per row through db, calling db.SaveChanges() after
// each row. Unlike the first variant, no Type value is read or stored.
// db, RankingDescription and ChinaEduSp.Crawl.HttpUtility are not defined in this snippet.
// NOTE(review): seq is re-read from db.RankingDescriptions.Count() at the top of every
// iteration, so the trailing seq++ has no effect.
// NOTE(review): the catch block only copies ex.Message into a local, so any failure
// (for example Convert.ToInt32 on a non-numeric cell) is silently ignored.
// NOTE(review): "/r/n" in the commented-out Response.Write looks like a typo for "\r\n".
// NOTE(review): the string literals use typographic quotes and must be replaced with
// plain double quotes before this code can be re-enabled.

//var url = “http://www.gaokao.com/e/20120109/4f0a914934baa_2.shtml”;

//var orgStr = ChinaEduSp.Crawl.HttpUtility.GetContentByUrl(url, “gb2312”);

//var htmlDocument = new HtmlDocument();

//htmlDocument.LoadHtml(orgStr);

//var rows = htmlDocument.DocumentNode.SelectNodes(“//table//tr//td//table//tr”);

//foreach (var item in rows)

//{


//    var pos = item.SelectSingleNode(“td[1]”).InnerText;

//    var school = item.SelectSingleNode(“td[2]”).InnerText;

//    var province = item.SelectSingleNode(“td[3]”).InnerText;

//    var totalScore = item.SelectSingleNode(“td[4]”).InnerText;

//    var seq = db.RankingDescriptions.Count();

//    //Response.Write(“名次:” + pos + ” 学校名称:” + school + ” 所在省份:” + province + ” 总分:” + totalScore + “/r/n”);

//    //Response.Write(“名次:” + pos + ” 学校名称:” + school);

//    try

//    {


//        db.RankingDescriptions.Add(new RankingDescription

//        {


//            POS = Convert.ToInt32(pos),

//            SchoolName = school,

//            Province = province,

//            Area = province,

//            TotalScore = totalScore,

//            IsShow = true,

//            IsDelete = false,

//            RankId = 0,

//            Seq = seq

//        });

//        db.SaveChanges();

//        seq++;

//    }

//    catch (Exception ex)

//    {


//        string msg = ex.Message;

//    }

//}

#endregion



版权声明:本文为TianGaojie123abc原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。