using AngleSharp;
using AngleSharp.Dom;
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using AngleSharp.Text;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace OceanNetWorks
{
    /// <summary>
    /// Form that downloads the page at <c>siteUrl</c>, selects the elements whose
    /// surrounding markup mentions one of <see cref="QueryTerms"/>, and lists them
    /// in the <c>scraperBox</c> control.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Exact <c>class</c> attribute value of the elements that are collected.</summary>
        private const string ArticleClassName = "views-field views-field-nothing";

        /// <summary>
        /// Single client shared by all scrapes; creating a new <see cref="HttpClient"/>
        /// per request leaks sockets.
        /// </summary>
        private static readonly HttpClient SharedHttpClient = new HttpClient();

        /// <summary>Address of the page that is downloaded and parsed.</summary>
        private readonly string siteUrl = "https://www.oceannetworks.ca/news/stories";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Title of the most recently processed result (set by <see cref="SplitResults"/>).</summary>
        private string Title { get; set; }

        /// <summary>Url of the most recently processed result (set by <see cref="SplitResults"/>).</summary>
        private string Url { get; set; }

        /// <summary>Terms an element's parent markup must mention for the element to be listed.</summary>
        public string[] QueryTerms { get; } = { "Ocean", "Nature", "Pollution" };

        /// <summary>
        /// Starts a scrape and reports download failures to the user.
        /// </summary>
        /// <remarks>
        /// This stays <c>async void</c> so existing fire-and-forget callers keep working;
        /// because nobody can await it, expected failures are handled here instead of
        /// being left to escape unobserved.
        /// </remarks>
        internal async void ScrapeWebsite()
        {
            try
            {
                await ScrapeWebsiteAsync(CancellationToken.None);
            }
            catch (Exception ex) when (ex is HttpRequestException || ex is OperationCanceledException)
            {
                // OperationCanceledException also covers HttpClient timeouts (TaskCanceledException).
                MessageBox.Show(
                    this,
                    $"Could not download {siteUrl}: {ex.Message}",
                    "Scrape failed",
                    MessageBoxButtons.OK,
                    MessageBoxIcon.Error);
            }
        }

        /// <summary>
        /// Downloads <c>siteUrl</c>, parses it as HTML and hands the document to
        /// <see cref="GetScrapeResults"/>.
        /// </summary>
        /// <param name="cancellationToken">Token that aborts the download.</param>
        /// <exception cref="HttpRequestException">
        /// The request failed or the server answered with a non-success status code.
        /// </exception>
        /// <exception cref="OperationCanceledException">The token was cancelled or the request timed out.</exception>
        private async Task ScrapeWebsiteAsync(CancellationToken cancellationToken)
        {
            using (HttpResponseMessage response = await SharedHttpClient.GetAsync(siteUrl, cancellationToken))
            {
                // Do not try to scrape an error page.
                response.EnsureSuccessStatusCode();

                using (Stream content = await response.Content.ReadAsStreamAsync())
                {
                    cancellationToken.ThrowIfCancellationRequested();

                    HtmlParser parser = new HtmlParser();
                    IHtmlDocument document = parser.ParseDocument(content);

                    GetScrapeResults(document);
                }
            }
        }

        /// <summary>
        /// Collects every element whose class is <see cref="ArticleClassName"/> and whose
        /// parent markup mentions at least one of <see cref="QueryTerms"/>, then prints
        /// them. Nothing is printed when there is no match.
        /// </summary>
        /// <param name="document">Parsed page to search.</param>
        private void GetScrapeResults(IHtmlDocument document)
        {
            // One pass over the document that tests all terms, so every term contributes
            // (not just the last one) and an element matching several terms is listed once.
            List<IElement> articleLink = document.All
                .Where(x => x.ClassName == ArticleClassName
                            && x.ParentElement != null
                            && MentionsAnyQueryTerm(x.ParentElement.InnerHtml))
                .ToList();

            if (articleLink.Count > 0)
            {
                PrintResults(articleLink);
            }
        }

        /// <summary>
        /// Returns <c>true</c> when <paramref name="html"/> contains any of
        /// <see cref="QueryTerms"/>, ignoring case.
        /// </summary>
        private bool MentionsAnyQueryTerm(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return false;
            }

            // Ordinal, case-insensitive search instead of comparing against a
            // culture-dependent ToLower() copy of the term.
            return QueryTerms.Any(term => html.IndexOf(term, StringComparison.OrdinalIgnoreCase) >= 0);
        }

        /// <summary>
        /// Writes one "title - / url" entry per element into <c>scraperBox</c>,
        /// replacing its previous content.
        /// </summary>
        /// <param name="articleLink">Elements to print.</param>
        /// <exception cref="ArgumentNullException"><paramref name="articleLink"/> is <c>null</c>.</exception>
        public void PrintResults(IEnumerable<IElement> articleLink)
        {
            if (articleLink == null)
            {
                throw new ArgumentNullException(nameof(articleLink));
            }

            // Accumulate all entries and assign once; assigning Text inside the loop
            // would leave only the last entry visible.
            StringBuilder output = new StringBuilder();
            foreach (IElement element in articleLink)
            {
                CleanUpResults(element);
                output.Append($"{Title} - {Environment.NewLine}{Url}{Environment.NewLine}");
            }

            scraperBox.Text = output.ToString();
        }

        /// <summary>
        /// Rewrites the element's inner HTML into a <c>'*'</c>-separated string and
        /// passes it to <see cref="SplitResults"/>, which updates <see cref="Url"/>
        /// and <see cref="Title"/>.
        /// </summary>
        /// <param name="result">Element to process.</param>
        private void CleanUpResults(IElement result)
        {
            // NOTE(review): the search markers below consist only of line breaks. They
            // look like they once contained the HTML markup surrounding the link and may
            // have lost it; confirm against the live page before relying on the output.
            string htmlResult = result.InnerHtml.ReplaceFirst("\n", "*");
            htmlResult = htmlResult.ReplaceFirst("\n\n\n", "-");
            htmlResult = htmlResult.ReplaceFirst("\n\n\n", "");

            SplitResults(htmlResult);
        }

        /// <summary>
        /// Splits <paramref name="htmlResult"/> on <c>'*'</c>: the first part becomes
        /// <see cref="Url"/>, the second <see cref="Title"/>.
        /// </summary>
        /// <param name="htmlResult">String produced by <see cref="CleanUpResults"/>.</param>
        private void SplitResults(string htmlResult)
        {
            string[] splitResults = htmlResult.Split('*');

            Url = splitResults[0];

            // Split always yields at least one part; a missing separator must not throw
            // or leave the previous element's title in place.
            Title = splitResults.Length > 1 ? splitResults[1] : string.Empty;
        }

        private void btnScrape_Click(object sender, EventArgs e)
        {
            ScrapeWebsite();
        }
    }
}