首先需要引用两个依赖的类库:
1、HtmlAgilityPack 1.8.4.0
2、CsharpHttpHelper(苏飞论坛原创)
抓取tianyancha代码如下:
/// <summary>
/// Searches tianyancha.com for a company keyword, opens the first search hit and
/// scrapes basic registration details from the company page.
/// </summary>
/// <param name="conmpany">Company keyword to search for.</param>
/// <returns>
/// A multi-line report containing the keyword, elapsed time in milliseconds, company name,
/// company status, unified social credit code, organization code and legal representative.
/// </returns>
/// <exception cref="InvalidOperationException">
/// An expected element is missing from the returned HTML (for example, the site layout
/// changed, the search returned no hits, or the request was blocked).
/// </exception>
private static string Test21(string conmpany)
{
    // Stopwatch gives the total elapsed time; TimeSpan.Milliseconds would only give the 0-999 ms component.
    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

    HttpHelper http = new HttpHelper();
    HttpItem item = new HttpItem()
    {
        // Escape the keyword so spaces, '&', '#', etc. cannot corrupt the query string.
        URL = "https://www.tianyancha.com/search?key=" + Uri.EscapeDataString(conmpany ?? string.Empty), // required
        Method = "GET", // optional
        Timeout = 100000, // connection timeout, optional
        ReadWriteTimeout = 30000, // read/write timeout, optional
        IsToLower = false, // keep the returned HTML in its original case
        Cookie = "", // cookie string, optional
        UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0", // browser identification, optional
        Accept = "text/html, application/xhtml+xml, */*", // optional
        ContentType = "text/html", // optional
        Referer = "https://www.tianyancha.com", // referring URL, optional
        Allowautoredirect = false, // do not follow redirects
        AutoRedirectCookie = false, // do not auto-handle cookies on redirect
        Postdata = "", // not needed for GET
        ResultType = ResultType.String, // return the body as a string rather than bytes
    };
    CsharpHttpHelper.HttpResult result = http.GetHtml(item);

    HtmlDocument searchDoc = new HtmlDocument();
    searchDoc.LoadHtml(result.Html);

    // The first search hit links to the company detail page.
    HtmlNode firstHit = Test21SelectNode(searchDoc, "//a[starts-with(@class,'name')]", 0);
    string conmpanyurl = firstHit.GetAttributeValue("href", string.Empty);
    if (string.IsNullOrWhiteSpace(conmpanyurl))
    {
        throw new InvalidOperationException("The first tianyancha search result has no href attribute.");
    }

    HttpItem itemcompany = new HttpItem()
    {
        URL = conmpanyurl, // required
        Method = "GET", // optional
        Timeout = 100000, // connection timeout, optional
        ReadWriteTimeout = 30000, // read/write timeout, optional
        IsToLower = false, // keep the returned HTML in its original case
        Cookie = "", // cookie string, optional
        UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0", // browser identification, optional
        Accept = "text/html, application/xhtml+xml, */*", // optional
        ContentType = "text/html", // optional
        Referer = "https://www.tianyancha.com", // referring URL, optional
        Allowautoredirect = false, // do not follow redirects
        AutoRedirectCookie = false, // do not auto-handle cookies on redirect
        Postdata = "", // not needed for GET
        ResultType = ResultType.String, // return the body as a string rather than bytes
    };
    CsharpHttpHelper.HttpResult resultcompany = http.GetHtml(itemcompany);

    HtmlDocument companyDoc = new HtmlDocument();
    companyDoc.LoadHtml(resultcompany.Html);

    // Company name
    string name = Test21SelectNode(companyDoc, "//h1[starts-with(@class,'name')]", 0).InnerHtml;
    // Company status (e.g. existing / in operation)
    string state = Test21SelectNode(companyDoc, "//div[starts-with(@class,'num-opening')]", 0).InnerHtml;

    // Registration codes live in the second table of the base-info container.
    // Each fragment is re-parsed on its own so the "//table" and "//td" indexes are
    // relative to that fragment, exactly as in the original positional lookups.
    HtmlNode baseInfo = Test21SelectNode(companyDoc, "//div[starts-with(@id,'_container_baseInfo')]", 0);
    HtmlDocument baseInfoDoc = new HtmlDocument();
    baseInfoDoc.LoadHtml(baseInfo.InnerHtml);
    HtmlDocument codeTableDoc = new HtmlDocument();
    codeTableDoc.LoadHtml(Test21SelectNode(baseInfoDoc, "//table", 1).InnerHtml);

    // Unified social credit code
    string code = Test21SelectNode(codeTableDoc, "//td", 6).InnerHtml;
    // Organization code
    string zcode = Test21SelectNode(codeTableDoc, "//td", 3).InnerHtml;

    // Legal representative
    string bossname = Test21SelectNode(companyDoc, "//div[starts-with(@class,'name')]//a", 0).InnerHtml;

    timer.Stop();
    return $"{conmpany}: \r\n\n用时:{timer.ElapsedMilliseconds}ms \r\n\n公司名:{name} \r\n\n公司状态:{state} \r\n\n统一社会信用代码:{code} \r\n\n组织机构代码:{zcode} \r\n\n法人姓名:{bossname}";
}

/// <summary>
/// Returns the node at <paramref name="index"/> among the XPath matches, or throws a
/// descriptive exception when the match set is missing or too short.
/// </summary>
private static HtmlNode Test21SelectNode(HtmlDocument doc, string xpath, int index)
{
    // SelectNodes yields null (not an empty collection) when nothing matches.
    HtmlNodeCollection matches = doc.DocumentNode.SelectNodes(xpath);
    if (matches == null || matches.Count <= index)
    {
        throw new InvalidOperationException(
            $"tianyancha page did not contain the expected element: XPath '{xpath}', index {index}.");
    }

    return matches[index];
}
抓取qichacha代码如下:
/// <summary>
/// Searches qichacha.com for a company keyword, opens the first search hit and
/// scrapes basic registration details from the company page.
/// </summary>
/// <param name="conmpany">Company keyword to search for.</param>
/// <returns>
/// A multi-line report containing the keyword, elapsed time in milliseconds, company name,
/// company status, unified social credit code, organization code and legal representative.
/// </returns>
/// <exception cref="InvalidOperationException">
/// An expected element is missing from the returned HTML (for example, the site layout
/// changed, the search returned no hits, or the request was blocked).
/// </exception>
private static string Test24(string conmpany)
{
    // Stopwatch gives the total elapsed time; TimeSpan.Milliseconds would only give the 0-999 ms component.
    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

    HttpHelper http = new HttpHelper();
    HttpItem item = new HttpItem()
    {
        // Escape the keyword so spaces, '&', '#', etc. cannot corrupt the query string.
        URL = "https://www.qichacha.com/search?key=" + Uri.EscapeDataString(conmpany ?? string.Empty), // required
        Method = "GET", // optional
        Timeout = 100000, // connection timeout, optional
        ReadWriteTimeout = 30000, // read/write timeout, optional
        IsToLower = false, // keep the returned HTML in its original case
        Cookie = "a", // NOTE(review): looks like a placeholder cookie value - confirm what the site requires
        UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.10 Safari/537.36", // browser identification, optional
        Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", // optional
        ContentType = "text/html", // optional
        Referer = "https://www.qichacha.com", // referring URL, optional
        Allowautoredirect = true, // follow redirects on the search request
        AutoRedirectCookie = true, // auto-handle cookies on redirect
        Postdata = "", // not needed for GET
        ResultType = ResultType.String, // return the body as a string rather than bytes
    };
    CsharpHttpHelper.HttpResult result = http.GetHtml(item);

    HtmlDocument searchDoc = new HtmlDocument();
    searchDoc.LoadHtml(result.Html);

    // The first search hit carries a site-relative link to the company detail page.
    HtmlNode firstHit = Test24SelectNode(searchDoc, "//a[starts-with(@class,'ma_h1')]", 0);
    string relativeUrl = firstHit.GetAttributeValue("href", string.Empty);
    if (string.IsNullOrWhiteSpace(relativeUrl))
    {
        throw new InvalidOperationException("The first qichacha search result has no href attribute.");
    }

    string conmpanyurl = "https://www.qichacha.com" + relativeUrl;

    HttpItem itemcompany = new HttpItem()
    {
        URL = conmpanyurl, // required
        Method = "GET", // optional
        Timeout = 100000, // connection timeout, optional
        ReadWriteTimeout = 30000, // read/write timeout, optional
        IsToLower = false, // keep the returned HTML in its original case
        Cookie = "aa", // NOTE(review): looks like a placeholder cookie value - confirm what the site requires
        UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0", // browser identification, optional
        Accept = "text/html, application/xhtml+xml, */*", // optional
        ContentType = "text/html", // optional
        Referer = "https://www.qichacha.com", // referring URL, optional
        Allowautoredirect = false, // do not follow redirects
        AutoRedirectCookie = false, // do not auto-handle cookies on redirect
        Postdata = "", // not needed for GET
        ResultType = ResultType.String, // return the body as a string rather than bytes
    };
    CsharpHttpHelper.HttpResult resultcompany = http.GetHtml(itemcompany);

    HtmlDocument companyDoc = new HtmlDocument();
    companyDoc.LoadHtml(resultcompany.Html);

    // Company name: prefer the <h1>; fall back to the "row title" div when the page has no <h1>.
    HtmlNodeCollection nameNodes = companyDoc.DocumentNode.SelectNodes("//h1");
    if (nameNodes == null)
    {
        nameNodes = companyDoc.DocumentNode.SelectNodes("//div[starts-with(@class,'row title')]");
    }

    if (nameNodes == null || nameNodes.Count == 0)
    {
        throw new InvalidOperationException("qichacha page did not contain a company name element.");
    }

    string name = nameNodes[0].InnerHtml.Replace("\n", "");

    // The remaining fields live in the first two tables of the "Cominfo" section.
    // Each fragment is re-parsed on its own so the "//table" and "//td" indexes are
    // relative to that fragment, exactly as in the original positional lookups.
    HtmlDocument infoDoc = new HtmlDocument();
    infoDoc.LoadHtml(Test24SelectNode(companyDoc, "//section[starts-with(@id,'Cominfo')]", 0).InnerHtml);

    HtmlDocument Dictable1 = new HtmlDocument();
    Dictable1.LoadHtml(Test24SelectNode(infoDoc, "//table", 0).InnerHtml);
    HtmlDocument Dictable2 = new HtmlDocument();
    Dictable2.LoadHtml(Test24SelectNode(infoDoc, "//table", 1).InnerHtml);

    // Legal representative
    string bossname = Test24SelectNode(Dictable1, "//h2[starts-with(@class,'seo font-20')]", 0).InnerHtml;
    // Company status (e.g. existing / in operation)
    string state = Test24SelectNode(Dictable2, "//td", 5).InnerHtml.Replace("\n", "");
    // Unified social credit code
    string code = Test24SelectNode(Dictable2, "//td", 9).InnerHtml.Replace("\n", "");
    // Organization code
    string zcode = Test24SelectNode(Dictable2, "//td", 15).InnerHtml.Replace("\n", "");

    timer.Stop();
    return $"{conmpany}: \r\n\n用时:{timer.ElapsedMilliseconds}ms \r\n\n公司名:{name} \r\n\n公司状态:{state} \r\n\n统一社会信用代码:{code} \r\n\n组织机构代码:{zcode} \r\n\n法人姓名:{bossname}";
}

/// <summary>
/// Returns the node at <paramref name="index"/> among the XPath matches, or throws a
/// descriptive exception when the match set is missing or too short.
/// </summary>
private static HtmlNode Test24SelectNode(HtmlDocument doc, string xpath, int index)
{
    // SelectNodes yields null (not an empty collection) when nothing matches.
    HtmlNodeCollection matches = doc.DocumentNode.SelectNodes(xpath);
    if (matches == null || matches.Count <= index)
    {
        throw new InvalidOperationException(
            $"qichacha page did not contain the expected element: XPath '{xpath}', index {index}.");
    }

    return matches[index];
}
留下您的脚步
最近评论