dags att checka in
This commit is contained in:
107
OceanNetWorks/Form1.cs
Normal file
107
OceanNetWorks/Form1.cs
Normal file
@ -0,0 +1,107 @@
|
||||
using AngleSharp;
|
||||
using AngleSharp.Dom;
|
||||
using AngleSharp.Html.Dom;
|
||||
using AngleSharp.Html.Parser;
|
||||
using AngleSharp.Text;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.ComponentModel;
|
||||
using System.Data;
|
||||
using System.Drawing;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Net.Http;
|
||||
using System.Text;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using System.Windows.Forms;
|
||||
|
||||
namespace OceanNetWorks
|
||||
{
|
||||
/// <summary>
/// Main window of the scraper. Downloads the page at <c>siteUrl</c>, finds the
/// article teasers that mention any of the <see cref="QueryTerms"/>, and lists
/// them as "title - url" lines in <c>scraperBox</c>.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>
    /// Single client shared by every scrape. Creating a new HttpClient per
    /// request leaks sockets, so one instance is reused for the process lifetime.
    /// </summary>
    private static readonly HttpClient SharedHttpClient = new HttpClient();

    /// <summary>Exact <c>class</c> attribute value of the elements that hold an article teaser.</summary>
    private const string ArticleClassName = "views-field views-field-nothing";

    /// <summary>Marker inserted between the URL and the title while cleaning up a teaser.</summary>
    private const char UrlTitleSeparator = '*';

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>Title of the most recently parsed teaser (set by <see cref="SplitResults"/>).</summary>
    private string Title { get; set; }

    /// <summary>Absolute URL of the most recently parsed teaser (set by <see cref="SplitResults"/>).</summary>
    private string Url { get; set; }

    private string siteUrl = "https://www.oceannetworks.ca/news/stories";

    /// <summary>Terms an article teaser must mention (case-insensitively) to be listed.</summary>
    public string[] QueryTerms { get; } = { "Ocean", "Nature", "Pollution" };

    /// <summary>
    /// Downloads <c>siteUrl</c>, parses it and displays the matching articles.
    /// Network failures are reported in <c>scraperBox</c> instead of being thrown,
    /// because exceptions escaping an <c>async void</c> method cannot be observed
    /// by the caller.
    /// </summary>
    internal async void ScrapeWebsite()
    {
        // NOTE(review): the page is parsed as static HTML. If the listing ever
        // requires script execution, a BrowsingContext configured with
        // AngleSharp.Js (Configuration.Default.WithJs()) would be needed - confirm.
        try
        {
            using (HttpResponseMessage response = await SharedHttpClient.GetAsync(siteUrl))
            {
                // Do not try to scrape an error page.
                response.EnsureSuccessStatusCode();

                using (Stream body = await response.Content.ReadAsStreamAsync())
                using (IHtmlDocument document = new HtmlParser().ParseDocument(body))
                {
                    GetScrapeResults(document);
                }
            }
        }
        catch (HttpRequestException ex)
        {
            scraperBox.Text = $"Could not load {siteUrl}: {ex.Message}";
        }
        catch (TaskCanceledException)
        {
            // HttpClient signals a request timeout with TaskCanceledException.
            scraperBox.Text = $"Could not load {siteUrl}: the request timed out.";
        }
    }

    /// <summary>
    /// Collects every article teaser whose parent markup mentions at least one of
    /// the <see cref="QueryTerms"/> and hands them to <see cref="PrintResults"/>.
    /// </summary>
    /// <param name="document">Parsed page to search.</param>
    /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
    private void GetScrapeResults(IHtmlDocument document)
    {
        if (document == null)
        {
            throw new ArgumentNullException(nameof(document));
        }

        // Every term is tested against each element, so a teaser that matches
        // several terms is still returned once, in document order. The list is
        // materialized so the query is not re-evaluated on each enumeration.
        List<IElement> matches = document.All
            .Where(element => element.ClassName == ArticleClassName && MentionsAnyTerm(element))
            .ToList();

        // Called even when nothing matched so that stale output is cleared.
        PrintResults(matches);
    }

    /// <summary>
    /// Returns true when the markup of the element's parent contains any query term,
    /// ignoring case.
    /// </summary>
    private bool MentionsAnyTerm(IElement element)
    {
        string parentHtml = element.ParentElement?.InnerHtml;
        if (string.IsNullOrEmpty(parentHtml))
        {
            return false;
        }

        return QueryTerms.Any(term =>
            !string.IsNullOrEmpty(term) &&
            parentHtml.IndexOf(term, StringComparison.OrdinalIgnoreCase) >= 0);
    }

    /// <summary>
    /// Writes one "title - url" line per element into <c>scraperBox</c>, replacing
    /// its previous content. An empty sequence clears the box.
    /// </summary>
    /// <param name="articleLink">Article teaser elements to display.</param>
    /// <exception cref="ArgumentNullException"><paramref name="articleLink"/> is null.</exception>
    public void PrintResults(IEnumerable<IElement> articleLink)
    {
        if (articleLink == null)
        {
            throw new ArgumentNullException(nameof(articleLink));
        }

        // Accumulate all lines and assign once; assigning Text inside the loop
        // would leave only the last article visible.
        StringBuilder output = new StringBuilder();

        foreach (IElement element in articleLink)
        {
            CleanUpResults(element);
            output.Append($"{Title} - {Url}{Environment.NewLine}");
        }

        scraperBox.Text = output.ToString();
    }

    /// <summary>
    /// Strips the teaser markup down to "url*title" and stores the two parts in
    /// <see cref="Url"/> and <see cref="Title"/>.
    /// </summary>
    /// <param name="result">Article teaser element.</param>
    private void CleanUpResults(IElement result)
    {
        // The opening markup is swapped for the site root so the relative href
        // that follows becomes an absolute URL.
        string htmlResult = result.InnerHtml.ReplaceFirst(" <span class=\"field-content\"><div><a href=\"", "https://www.oceannetworks.ca");
        // The end of the href attribute becomes the URL/title separator.
        htmlResult = htmlResult.ReplaceFirst("\">", "*");
        htmlResult = htmlResult.ReplaceFirst("</a></div>\n<div class=\"article-title-top\">", "-");
        htmlResult = htmlResult.ReplaceFirst("</div>\n<hr></span> ", "");

        SplitResults(htmlResult);
    }

    /// <summary>
    /// Splits a "url*title" string at the first separator into <see cref="Url"/>
    /// and <see cref="Title"/>. When the separator is missing the whole text is
    /// treated as the URL and the title is left empty.
    /// </summary>
    /// <param name="htmlResult">Cleaned-up teaser text.</param>
    private void SplitResults(string htmlResult)
    {
        // Limit to two parts so a '*' inside the title does not truncate it.
        string[] splitResults = (htmlResult ?? string.Empty).Split(new[] { UrlTitleSeparator }, 2);

        Url = splitResults[0];
        Title = splitResults.Length > 1 ? splitResults[1] : string.Empty;
    }

    /// <summary>Click handler of the scrape button: starts a scrape.</summary>
    private void btnScrape_Click(object sender, EventArgs e)
    {
        ScrapeWebsite();
    }
}
|
||||
}
|
||||
Reference in New Issue
Block a user