今天我們來講講如何使用.NET開源(MIT License)的輕量、靈活、高性能、跨平臺的分布式網絡爬蟲框架DotnetSpider來快速實現網頁數據抓取功能。
注意:為了自身安全請在國家法律允許范圍內開發網頁爬蟲功能。
本文我們以抓取博客園10天推薦排行榜第一頁的文章標題、文章簡介和文章地址為示例,并把抓取下來的數據保存到對應的txt文本中。
圖片
創建名為DotnetSpiderExercise的控制臺應用。
圖片
圖片
圖片
NuGet包管理器搜索:DotnetSpider
圖片
NuGet包管理器搜索:Serilog.AspNetCore
圖片
namespace DotnetSpiderExercise
{
    /// <summary>
    /// Plain data holder for one article scraped from the recommended-ranking page.
    /// </summary>
    public class RecommendedRankingModel
    {
        /// <summary>
        /// Article title.
        /// </summary>
        public string ArticleTitle { get; set; }

        /// <summary>
        /// Article summary.
        /// </summary>
        public string ArticleSummary { get; set; }

        /// <summary>
        /// Article URL.
        /// </summary>
        public string ArticleUrl { get; set; }
    }
}
網頁數據抓取的業務邏輯都在這里面。
using DotnetSpider.DataFlow.Parser;
using DotnetSpider.DataFlow;
using DotnetSpider.Downloader;
using DotnetSpider.Http;
using DotnetSpider.Scheduler.Component;
using DotnetSpider.Selector;
using DotnetSpider;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Serilog;
using DotnetSpider.Scheduler;
using Microsoft.Extensions.Hosting;
using System.Reflection;

namespace DotnetSpiderExercise
{
    /// <summary>
    /// Spider that requests the cnblogs "top diggs" page, extracts the title, summary and
    /// URL of every listed article, echoes the titles to the console and writes all
    /// records to a text file.
    /// </summary>
    public class RecommendedRankingSpider : Spider
    {
        // Page to crawl: the cnblogs 10-day recommended ranking.
        private const string RankingUrl = "https://www.cnblogs.com/aggsite/topdiggs";

        // Output file, resolved relative to the process working directory.
        private const string OutputFileName = "RecommendedRanking.txt";

        // Visual separator written after every record in the output file.
        private const string RecordSeparator =
            "\r\n ========================================================================================== \r\n";

        /// <summary>
        /// Forwards the injected options, services and logger to the <see cref="Spider"/> base class.
        /// </summary>
        public RecommendedRankingSpider(
            IOptions<SpiderOptions> options,
            DependenceServices services,
            ILogger<Spider> logger)
            : base(options, services, logger)
        {
        }

        /// <summary>
        /// Builds a host for this spider (Serilog logging, <see cref="HttpClientDownloader"/>,
        /// queue-based BFS scheduler with <see cref="HashSetDuplicateRemover"/>) and runs it.
        /// </summary>
        /// <returns>A task that completes when the built host's <c>RunAsync</c> completes.</returns>
        public static async Task RunAsync()
        {
            var builder = Builder.CreateDefaultBuilder<RecommendedRankingSpider>();
            builder.UseSerilog();
            builder.UseDownloader<HttpClientDownloader>();
            builder.UseQueueDistinctBfsScheduler<HashSetDuplicateRemover>();
            await builder.Build().RunAsync();
        }

        /// <summary>
        /// Registers the data-flow pipeline and enqueues the single ranking-page request.
        /// </summary>
        /// <param name="stoppingToken">Cancellation token supplied by the caller; not used by this override.</param>
        protected override async Task InitializeAsync(CancellationToken stoppingToken = default)
        {
            // Custom parser that extracts the article data.
            AddDataFlow(new Parser());

            // Console storage data flow.
            AddDataFlow(new ConsoleStorage());

            // Crawl request: cnblogs 10-day recommended ranking.
            await AddRequestsAsync(new Request(RankingUrl)
            {
                // Request timeout of 10 seconds (value presumably in milliseconds — confirm against DotnetSpider docs).
                Timeout = 10000
            });
        }

        /// <summary>
        /// Data parser that turns the ranking page into <see cref="RecommendedRankingModel"/> records.
        /// </summary>
        private sealed class Parser : DataParser
        {
            /// <summary>
            /// No initialization is required; returns a completed task.
            /// </summary>
            public override Task InitializeAsync()
            {
                return Task.CompletedTask;
            }

            /// <summary>
            /// Selects every <c>article.post-item</c> node, extracts title, summary and link,
            /// prints each title to the console and writes all records to <c>RecommendedRanking.txt</c>.
            /// </summary>
            /// <param name="context">Data-flow context whose <c>Selectable</c> exposes the downloaded page.</param>
            /// <returns>A completed task; all work is done synchronously.</returns>
            protected override Task ParseAsync(DataFlowContext context)
            {
                var recommendedRankingList = new List<RecommendedRankingModel>();

                // Parse the page: one <article class="post-item"> per ranked post.
                var number = 1;
                var recommendedList = context.Selectable.SelectList(Selectors.XPath(".//article[@class='post-item']"));
                foreach (var news in recommendedList)
                {
                    var articleTitle = news.Select(Selectors.XPath(".//a[@class='post-item-title']"))?.Value;

                    // Strip line breaks and spaces so the summary fits on a single line.
                    var articleSummary = news.Select(Selectors.XPath(".//p[@class='post-item-summary']"))?.Value?
                        .Replace("\n", "")
                        .Replace(" ", "");

                    var articleUrl = news.Select(Selectors.XPath(".//a[@class='post-item-title']/@href"))?.Value;

                    Console.WriteLine($"第{number}篇文章 標題:{articleTitle}");

                    recommendedRankingList.Add(new RecommendedRankingModel
                    {
                        ArticleTitle = articleTitle,
                        ArticleSummary = articleSummary,
                        ArticleUrl = articleUrl
                    });
                    number++;
                }

                // Creating the writer without an append flag overwrites any previous output file.
                using (var sw = new StreamWriter(OutputFileName))
                {
                    foreach (RecommendedRankingModel model in recommendedRankingList)
                    {
                        string line = $"文章標題:{model.ArticleTitle}\r\n文章簡介:{model.ArticleSummary}\r\n文章地址:{model.ArticleUrl}";
                        sw.WriteLine(line + RecordSeparator);
                    }
                }

                return Task.CompletedTask;
            }
        }
    }
}
namespace DotnetSpiderExercise
{
    /// <summary>
    /// Console entry point that runs the recommended-ranking spider once.
    /// </summary>
    public class Program
    {
        /// <summary>
        /// Prints a start message, awaits <c>RecommendedRankingSpider.RunAsync</c>, then prints a completion message.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static async Task Main(string[] args)
        {
            Console.WriteLine("網頁數據抓取開始...");

            await RecommendedRankingSpider.RunAsync();

            Console.WriteLine("網頁數據抓取完成...");
        }
    }
}
圖片
圖片
圖片
更多項目實用功能和特性歡迎前往項目開源地址查看
本文鏈接:http://www.tebozhan.com/showinfo-26-90186-0.html 我們一起聊聊.NET快速實現網頁數據抓取
聲明:本網頁內容旨在傳播知識,若有侵權等問題請及時與本網聯系,我們將在第一時間刪除處理。郵件:2376512515@qq.com
上一篇: 快手一季度營收 294 億元同比增長 16.6%,平均日活躍用戶達 3.94 億