Files
WebScrapeApp/OceanNetWorks/Form1.cs
2021-08-02 12:41:02 +02:00

108 lines
3.4 KiB
C#

using AngleSharp;
using AngleSharp.Dom;
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using AngleSharp.Text;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace OceanNetWorks
{
    /// <summary>
    /// Windows Forms window that downloads the page at <c>siteUrl</c>, picks the
    /// article entries whose surrounding markup mentions one of the
    /// <see cref="QueryTerms"/>, and lists them in <c>scraperBox</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>
        /// Single client reused for every scrape; creating a new
        /// <see cref="HttpClient"/> per request leaks sockets.
        /// </summary>
        private static readonly HttpClient SharedHttpClient = new HttpClient();

        /// <summary>Exact <c>class</c> attribute value of the elements that hold one article entry.</summary>
        private const string ArticleFieldClass = "views-field views-field-nothing";

        /// <summary>CSS selector of the element holding the article heading inside an entry.</summary>
        private const string ArticleHeadingSelector = ".article-title-top";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Title of the article most recently parsed by <see cref="CleanUpResults"/>.</summary>
        private string Title { get; set; }

        /// <summary>Absolute URL of the article most recently parsed by <see cref="CleanUpResults"/>.</summary>
        private string Url { get; set; }

        private string siteUrl = "https://www.oceannetworks.ca/news/stories";
        //private string siteUrlx = "https://www.finansportalen.se/aktiekurser/";

        /// <summary>Terms an article entry must mention (case-insensitively) to be listed.</summary>
        public string[] QueryTerms { get; } = { "Ocean", "Nature", "Pollution" };

        /// <summary>
        /// Downloads and parses <c>siteUrl</c>, then displays the matching articles.
        /// Download failures (network error, non-success status, client timeout) are
        /// reported in <c>scraperBox</c> instead of escaping this fire-and-forget method.
        /// </summary>
        internal async void ScrapeWebsite()
        {
            //var config = Configuration.Default
            // .WithJs(); // from AngleSharp.Js
            //var context = BrowsingContext.New(config);
            try
            {
                using (HttpResponseMessage response = await SharedHttpClient.GetAsync(siteUrl))
                {
                    // Fail loudly on 4xx/5xx rather than parsing an error page.
                    response.EnsureSuccessStatusCode();

                    using (Stream body = await response.Content.ReadAsStreamAsync())
                    {
                        HtmlParser parser = new HtmlParser();
                        using (IHtmlDocument document = parser.ParseDocument(body))
                        {
                            // The form may have been closed while the request was in flight.
                            if (IsDisposed)
                            {
                                return;
                            }

                            GetScrapeResults(document);
                        }
                    }
                }
            }
            catch (Exception ex) when (ex is HttpRequestException || ex is TaskCanceledException)
            {
                // TaskCanceledException is what HttpClient raises when its timeout elapses.
                if (!IsDisposed)
                {
                    scraperBox.Text = $"Could not load {siteUrl}: {ex.Message}";
                }
            }
        }

        /// <summary>
        /// Collects every article entry whose parent element mentions at least one
        /// of the <see cref="QueryTerms"/> and hands the list to <see cref="PrintResults"/>.
        /// </summary>
        /// <param name="document">Parsed listing page.</param>
        private void GetScrapeResults(IHtmlDocument document)
        {
            // Materialise once: the query is otherwise re-run on every enumeration.
            List<IElement> matches = document.All
                .Where(element => element.ClassName == ArticleFieldClass
                                  && MentionsAnyQueryTerm(element.ParentElement))
                .ToList();

            PrintResults(matches);
        }

        /// <summary>
        /// Returns true when the markup of <paramref name="container"/> contains any
        /// query term, ignoring case. A missing container never matches.
        /// </summary>
        private bool MentionsAnyQueryTerm(IElement container)
        {
            if (container == null)
            {
                return false;
            }

            string markup = container.InnerHtml;
            return QueryTerms.Any(term => markup.IndexOf(term, StringComparison.OrdinalIgnoreCase) >= 0);
        }

        /// <summary>
        /// Replaces the content of <c>scraperBox</c> with one "Title - Url" line per
        /// element that could be parsed; elements without a usable link are skipped.
        /// An empty sequence clears the box.
        /// </summary>
        /// <param name="articleLink">Article entry elements to display.</param>
        /// <exception cref="ArgumentNullException"><paramref name="articleLink"/> is null.</exception>
        public void PrintResults(IEnumerable<IElement> articleLink)
        {
            if (articleLink == null)
            {
                throw new ArgumentNullException(nameof(articleLink));
            }

            StringBuilder lines = new StringBuilder();
            foreach (IElement element in articleLink)
            {
                if (CleanUpResults(element))
                {
                    lines.Append($"{Title} - {Url}{Environment.NewLine}");
                }
            }

            // Assign once so every result is shown, not just the last one.
            scraperBox.Text = lines.ToString();
        }

        /// <summary>
        /// Reads the link and title out of one article entry and stores them in
        /// <see cref="Url"/> and <see cref="Title"/>.
        /// </summary>
        /// <param name="result">Article entry element.</param>
        /// <returns>
        /// True when the entry contains a link whose <c>href</c> resolves against
        /// <c>siteUrl</c>; false otherwise, in which case <see cref="Url"/> and
        /// <see cref="Title"/> are left unchanged.
        /// </returns>
        private bool CleanUpResults(IElement result)
        {
            IElement link = result.QuerySelector("a[href]");
            if (link == null)
            {
                return false;
            }

            // Resolve against the listing page so relative and absolute hrefs both work.
            Uri resolved;
            string href = link.GetAttribute("href");
            if (string.IsNullOrWhiteSpace(href) || !Uri.TryCreate(new Uri(siteUrl), href, out resolved))
            {
                return false;
            }

            IElement heading = result.QuerySelector(ArticleHeadingSelector);
            string linkText = link.TextContent.Trim();
            string headingText = heading == null ? string.Empty : heading.TextContent.Trim();

            Url = resolved.AbsoluteUri;
            // Link text and heading joined with '-'; empty parts are dropped.
            Title = string.Join("-", new[] { linkText, headingText }.Where(part => part.Length > 0));
            return true;
        }

        /// <summary>Click handler of the scrape button: starts a scrape.</summary>
        private void btnScrape_Click(object sender, EventArgs e)
        {
            ScrapeWebsite();
        }
    }
}