首先需要引用两个依赖的类库:
1、HtmlAgilityPack 1.8.4.0
2、CsharpHttpHelper(苏飞论坛原创)
抓取tianyancha代码如下:
/// <summary>
/// Searches tianyancha.com for a company keyword and returns a text summary of the first hit.
/// </summary>
/// <remarks>
/// Two GET requests are issued through the same <c>HttpHelper</c>: the search page, then the
/// detail page linked from the first search result. All fields are read with XPath queries
/// against the returned HTML, so the method depends on the site's markup.
/// </remarks>
/// <param name="conmpany">Company keyword to search for.</param>
/// <returns>
/// A multi-line string containing the keyword, the elapsed time in milliseconds, the company
/// name, its status, the unified social credit code, the organization code and the legal
/// representative's name.
/// </returns>
/// <exception cref="ArgumentException"><paramref name="conmpany"/> is null, empty or whitespace.</exception>
/// <exception cref="InvalidOperationException">An expected element is missing from a downloaded page.</exception>
private static string Test21(string conmpany)
{
    if (string.IsNullOrWhiteSpace(conmpany))
    {
        throw new ArgumentException("A company keyword is required.", nameof(conmpany));
    }

    // Stopwatch gives the total elapsed time; TimeSpan.Milliseconds is only the 0-999 component.
    var timer = System.Diagnostics.Stopwatch.StartNew();
    HttpHelper http = new HttpHelper();

    // Escape the keyword so characters such as '&', '#' or spaces cannot corrupt the query string.
    string searchUrl = "https://www.tianyancha.com/search?key=" + Uri.EscapeDataString(conmpany);
    CsharpHttpHelper.HttpResult searchResult = http.GetHtml(CreateTianyanchaRequest(searchUrl));

    HtmlDocument searchDoc = new HtmlDocument();
    searchDoc.LoadHtml(searchResult.Html ?? string.Empty);

    // The first anchor whose class starts with "name" is taken as the best search hit.
    var firstHit = RequireTianyanchaNode(searchDoc.DocumentNode, "//a[starts-with(@class,'name')]", 0, "search result link");
    var hrefAttribute = firstHit.Attributes["href"];
    if (hrefAttribute == null || string.IsNullOrWhiteSpace(hrefAttribute.Value))
    {
        throw new InvalidOperationException("The first tianyancha search result has no href attribute.");
    }

    CsharpHttpHelper.HttpResult detailResult = http.GetHtml(CreateTianyanchaRequest(hrefAttribute.Value));
    HtmlDocument detailDoc = new HtmlDocument();
    detailDoc.LoadHtml(detailResult.Html ?? string.Empty);
    var page = detailDoc.DocumentNode;

    // Company name.
    string name = RequireTianyanchaNode(page, "//h1[starts-with(@class,'name')]", 0, "company name").InnerHtml;
    // Company status (e.g. existing / in operation).
    string state = RequireTianyanchaNode(page, "//div[starts-with(@class,'num-opening')]", 0, "company status").InnerHtml;

    // The codes are read from the second table inside the base-info container. The cell
    // positions (6 and 3) are tied to the page layout - NOTE(review): confirm against the live page.
    var baseInfo = RequireTianyanchaNode(page, "//div[starts-with(@id,'_container_baseInfo')]", 0, "base info container");
    var detailTable = RequireTianyanchaNode(baseInfo, ".//table", 1, "base info table");
    // Unified social credit code.
    string code = RequireTianyanchaNode(detailTable, ".//td", 6, "unified social credit code cell").InnerHtml;
    // Organization code.
    string zcode = RequireTianyanchaNode(detailTable, ".//td", 3, "organization code cell").InnerHtml;

    // Legal representative name.
    string bossname = RequireTianyanchaNode(page, "//div[starts-with(@class,'name')]//a", 0, "legal representative link").InnerHtml;

    timer.Stop();
    long elapsedMs = timer.ElapsedMilliseconds;
    return $"{conmpany}: \r\n\n用时:{elapsedMs}ms \r\n\n公司名:{name} \r\n\n公司状态:{state} \r\n\n统一社会信用代码:{code} \r\n\n组织机构代码:{zcode} \r\n\n法人姓名:{bossname}";
}

/// <summary>
/// Builds the GET request settings shared by both tianyancha requests; only the URL varies.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <returns>A configured <c>HttpItem</c>.</returns>
private static HttpItem CreateTianyanchaRequest(string url)
{
    return new HttpItem()
    {
        URL = url, // required
        Method = "GET",
        Timeout = 100000, // connection timeout
        ReadWriteTimeout = 30000, // read/write timeout
        IsToLower = false, // keep the HTML's original casing
        Cookie = "",
        UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0",
        Accept = "text/html, application/xhtml+xml, */*",
        ContentType = "text/html",
        Referer = "https://www.tianyancha.com",
        Allowautoredirect = false, // do not follow redirects
        AutoRedirectCookie = false, // no automatic cookie handling
        Postdata = "", // unused for GET
        ResultType = ResultType.String, // return the body as a string
    };
}

/// <summary>
/// Runs an XPath query and returns the match at <paramref name="index"/>, failing with a
/// descriptive error instead of a null-reference or index error when the page lacks it.
/// </summary>
/// <param name="root">Node the query is evaluated against.</param>
/// <param name="xpath">XPath expression to evaluate.</param>
/// <param name="index">Zero-based position of the wanted match.</param>
/// <param name="description">Human-readable name of the element, used in the error message.</param>
/// <returns>The matching node.</returns>
/// <exception cref="InvalidOperationException">Fewer than <paramref name="index"/> + 1 nodes matched.</exception>
private static HtmlAgilityPack.HtmlNode RequireTianyanchaNode(HtmlAgilityPack.HtmlNode root, string xpath, int index, string description)
{
    // SelectNodes yields null (not an empty collection) when nothing matches.
    HtmlAgilityPack.HtmlNodeCollection matches = root.SelectNodes(xpath);
    if (matches == null || matches.Count <= index)
    {
        throw new InvalidOperationException($"tianyancha page is missing the expected {description} (XPath: {xpath}, index: {index}).");
    }

    return matches[index];
}
抓取qichacha代码如下
/// <summary>
/// Searches qichacha.com for a keyword and returns a text summary of the first hit.
/// </summary>
/// <remarks>
/// Two GET requests are issued through the same <c>HttpHelper</c>: the search page, then the
/// detail page whose site-relative link is taken from the first search result. All fields are
/// read with XPath queries against the returned HTML, so the method depends on the site's markup.
/// </remarks>
/// <param name="conmpany">Keyword to search for.</param>
/// <returns>
/// A multi-line string containing the keyword, the elapsed time in milliseconds, the company
/// name, its status, the unified social credit code, the organization code and the legal
/// representative's name.
/// </returns>
/// <exception cref="ArgumentException"><paramref name="conmpany"/> is null, empty or whitespace.</exception>
/// <exception cref="InvalidOperationException">An expected element is missing from a downloaded page.</exception>
private static string Test24(string conmpany)
{
    if (string.IsNullOrWhiteSpace(conmpany))
    {
        throw new ArgumentException("A company keyword is required.", nameof(conmpany));
    }

    // Stopwatch gives the total elapsed time; TimeSpan.Milliseconds is only the 0-999 component.
    var timer = System.Diagnostics.Stopwatch.StartNew();
    HttpHelper http = new HttpHelper();

    HttpItem searchRequest = new HttpItem()
    {
        // Escape the keyword so characters such as '&', '#' or spaces cannot corrupt the query string.
        URL = "https://www.qichacha.com/search?key=" + Uri.EscapeDataString(conmpany), // required
        Method = "GET",
        Timeout = 100000, // connection timeout
        ReadWriteTimeout = 30000, // read/write timeout
        IsToLower = false, // keep the HTML's original casing
        Cookie = "a", // cookie string sent with the request
        UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.10 Safari/537.36",
        Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        ContentType = "text/html",
        Referer = "https://www.qichacha.com",
        Allowautoredirect = true, // follow redirects
        AutoRedirectCookie = true, // handle cookies automatically across redirects
        Postdata = "", // unused for GET
        ResultType = ResultType.String, // return the body as a string
    };
    CsharpHttpHelper.HttpResult searchResult = http.GetHtml(searchRequest);

    HtmlDocument searchDoc = new HtmlDocument();
    searchDoc.LoadHtml(searchResult.Html ?? string.Empty);

    // The first anchor whose class starts with "ma_h1" is taken as the best search hit.
    var firstHit = RequireQichachaNode(searchDoc.DocumentNode, "//a[starts-with(@class,'ma_h1')]", 0, "search result link");
    var hrefAttribute = firstHit.Attributes["href"];
    if (hrefAttribute == null || string.IsNullOrWhiteSpace(hrefAttribute.Value))
    {
        // Without a link there is nothing to request; fail here rather than sending an empty URL.
        throw new InvalidOperationException("The first qichacha search result has no href attribute.");
    }

    // The link is site-relative, so prefix the host.
    string detailUrl = "https://www.qichacha.com" + hrefAttribute.Value;

    HttpItem detailRequest = new HttpItem()
    {
        URL = detailUrl, // required
        Method = "GET",
        Timeout = 100000, // connection timeout
        ReadWriteTimeout = 30000, // read/write timeout
        IsToLower = false, // keep the HTML's original casing
        Cookie = "aa", // cookie string sent with the request
        UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0",
        Accept = "text/html, application/xhtml+xml, */*",
        ContentType = "text/html",
        Referer = "https://www.qichacha.com",
        Allowautoredirect = false, // do not follow redirects
        AutoRedirectCookie = false, // no automatic cookie handling
        Postdata = "", // unused for GET
        ResultType = ResultType.String, // return the body as a string
    };
    CsharpHttpHelper.HttpResult detailResult = http.GetHtml(detailRequest);

    HtmlDocument detailDoc = new HtmlDocument();
    detailDoc.LoadHtml(detailResult.Html ?? string.Empty);
    var page = detailDoc.DocumentNode;

    // Company name: prefer the first <h1>, otherwise fall back to the "row title" div.
    var nameNodes = page.SelectNodes("//h1");
    if (nameNodes == null)
    {
        nameNodes = page.SelectNodes("//div[starts-with(@class,'row title')]");
    }
    if (nameNodes == null || nameNodes.Count == 0)
    {
        throw new InvalidOperationException("qichacha page is missing the expected company name element.");
    }
    string name = nameNodes[0].InnerHtml.Replace("\n", "");

    // The remaining fields live in the first two tables of the "Cominfo" section. The cell
    // positions (5, 9, 15) are tied to the page layout - NOTE(review): confirm against the live page.
    var companyInfo = RequireQichachaNode(page, "//section[starts-with(@id,'Cominfo')]", 0, "company info section");
    var firstTable = RequireQichachaNode(companyInfo, ".//table", 0, "first company info table");
    var secondTable = RequireQichachaNode(companyInfo, ".//table", 1, "second company info table");

    // Legal representative name.
    string bossname = RequireQichachaNode(firstTable, ".//h2[starts-with(@class,'seo font-20')]", 0, "legal representative heading").InnerHtml;
    // Company status (e.g. existing / in operation).
    string state = RequireQichachaNode(secondTable, ".//td", 5, "company status cell").InnerHtml.Replace("\n", "");
    // Unified social credit code.
    string code = RequireQichachaNode(secondTable, ".//td", 9, "unified social credit code cell").InnerHtml.Replace("\n", "");
    // Organization code.
    string zcode = RequireQichachaNode(secondTable, ".//td", 15, "organization code cell").InnerHtml.Replace("\n", "");

    timer.Stop();
    long elapsedMs = timer.ElapsedMilliseconds;
    return $"{conmpany}: \r\n\n用时:{elapsedMs}ms \r\n\n公司名:{name} \r\n\n公司状态:{state} \r\n\n统一社会信用代码:{code} \r\n\n组织机构代码:{zcode} \r\n\n法人姓名:{bossname}";
}

/// <summary>
/// Runs an XPath query and returns the match at <paramref name="index"/>, failing with a
/// descriptive error instead of a null-reference or index error when the page lacks it.
/// </summary>
/// <param name="root">Node the query is evaluated against.</param>
/// <param name="xpath">XPath expression to evaluate.</param>
/// <param name="index">Zero-based position of the wanted match.</param>
/// <param name="description">Human-readable name of the element, used in the error message.</param>
/// <returns>The matching node.</returns>
/// <exception cref="InvalidOperationException">Fewer than <paramref name="index"/> + 1 nodes matched.</exception>
private static HtmlAgilityPack.HtmlNode RequireQichachaNode(HtmlAgilityPack.HtmlNode root, string xpath, int index, string description)
{
    // SelectNodes yields null (not an empty collection) when nothing matches.
    HtmlAgilityPack.HtmlNodeCollection matches = root.SelectNodes(xpath);
    if (matches == null || matches.Count <= index)
    {
        throw new InvalidOperationException($"qichacha page is missing the expected {description} (XPath: {xpath}, index: {index}).");
    }

    return matches[index];
}
川公网安备 51010702003150号
留下您的脚步
最近评论