diff --git a/CsharpCorne/CsharpCorner.csproj b/CsharpCorne/CsharpCorner.csproj
new file mode 100644
index 0000000..9d5ec29
--- /dev/null
+++ b/CsharpCorne/CsharpCorner.csproj
@@ -0,0 +1,12 @@
+
+
+
+    Exe
+    net5.0
+
+
+
+
+
+
+
diff --git a/CsharpCorne/Program.cs b/CsharpCorne/Program.cs
new file mode 100644
index 0000000..da2061b
--- /dev/null
+++ b/CsharpCorne/Program.cs
@@ -0,0 +1,24 @@
+using HtmlAgilityPack;
+using System;
+using System.Linq;
+
+namespace CsharpCorner
+{
+    class Program
+    {
+        /// <summary>
+        /// Loads the Avanza stock list page and collects every anchor whose class is "ellipsis".
+        /// </summary>
+        static void Main(string[] args)
+        {
+            var web = new HtmlWeb();
+            var doc = web.Load("https://www.avanza.se/aktier/lista.html");
+
+            // SelectNodes returns null (not an empty collection) when nothing matches,
+            // so guard before materializing the list.
+            var headerNames = doc.DocumentNode
+                .SelectNodes("//a[@class='ellipsis']")?.ToList()
+                ?? Enumerable.Empty<HtmlNode>().ToList();
+        }
+    }
+}
diff --git a/HeadlessBrowser/HeadlessBrowser.csproj b/HeadlessBrowser/HeadlessBrowser.csproj
new file mode 100644
index 0000000..7affca9
--- /dev/null
+++ b/HeadlessBrowser/HeadlessBrowser.csproj
@@ -0,0 +1,12 @@
+
+
+
+    Exe
+    net5.0
+
+
+
+
+
+
+
diff --git a/HeadlessBrowser/Program.cs b/HeadlessBrowser/Program.cs
new file mode 100644
index 0000000..8af5ffc
--- /dev/null
+++ b/HeadlessBrowser/Program.cs
@@ -0,0 +1,129 @@
+using Microsoft.AspNetCore.Razor.Language;
+using RazorEngine.Templating;
+using SimpleBrowser;
+using System;
+using System.Diagnostics;
+using System.IO;
+using System.Threading.Tasks;
+
+namespace HeadlessBrowser
+{
+    class Program
+    {
+        /// <summary>
+        /// Loads the di.se large-cap page with SimpleBrowser, logs every matched element,
+        /// and finally writes the browser's HTML request log to disk and opens it.
+        /// </summary>
+        private static async Task Main(string[] args)
+        {
+            using Browser browser = new Browser();
+            try
+            {
+                // log the browser request/response data to files so we can interrogate them in case of an issue with our scraping
+                browser.RequestLogged += OnBrowserRequestLogged;
+                browser.MessageLogged += new Action<Browser, string>(OnBrowserMessageLogged);
+
+                // we'll fake the user agent for websites that alter their content for unrecognised browsers
+                browser.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10";
+
+                await browser.NavigateAsync("https://www.di.se/bors/large-cap/");
+                if (LastRequestFailed(browser))
+                {
+                    // always check the last request in case the page failed to load
+                    return;
+                }
+
+                // check that the page contains text that we recognise before scraping it
+                if (!browser.ContainsText("Kurser"))
+                {
+                    browser.Log("The text we normally expect isn't present on the page");
+                    return;
+                }
+
+                browser.Log("Your Stock Prices:");
+                // we can use simple jquery selectors, though advanced selectors are yet to be implemented
+                foreach (HtmlResult item in browser.Select(".market__content .i-t__c--m , tr"))
+                {
+                    browser.Log("* " + item.Value);
+                }
+            }
+            catch (Exception ex)
+            {
+                browser.Log(ex.Message, LogMessageType.Error);
+                browser.Log(ex.StackTrace, LogMessageType.StackTrace);
+            }
+            finally
+            {
+                // runs on every exit path (including the early returns) so a log file is always produced
+                RenderService rsvc = new RenderService();
+
+                string path = WriteFile("log-" + DateTime.UtcNow.Ticks + ".html", browser.RenderHtmlLogFile(rsvc, "SimpleBrowser Sample - Request Log"));
+
+                Console.WriteLine("Log file published to:");
+                Console.WriteLine(path);
+
+                // UseShellExecute asks the OS to open the file; Process is IDisposable, so release it when done
+                using var process = new Process();
+                process.StartInfo.FileName = path;
+                process.StartInfo.UseShellExecute = true;
+                process.Start();
+            }
+        }
+
+        /// <summary>
+        /// Logs and reports whether the browser's most recent request raised a web exception.
+        /// </summary>
+        /// <returns><c>true</c> if <c>LastWebException</c> is set; otherwise <c>false</c>.</returns>
+        private static bool LastRequestFailed(Browser browser)
+        {
+            if (browser.LastWebException != null)
+            {
+                browser.Log("There was an error loading the page: " + browser.LastWebException.Message);
+                return true;
+            }
+            return false;
+        }
+
+        /// <summary>Echoes each browser log message to the console.</summary>
+        private static void OnBrowserMessageLogged(Browser browser, string log)
+        {
+            Console.WriteLine(log);
+        }
+
+        /// <summary>Prints the method, URL and response code of each logged request.</summary>
+        private static void OnBrowserRequestLogged(Browser req, HttpRequestLog log)
+        {
+            Console.WriteLine(" -> " + log.Method + " request to " + log.Url);
+            Console.WriteLine(" <- Response status code: " + log.ResponseCode);
+        }
+
+        /// <summary>
+        /// Writes <paramref name="text"/> to <paramref name="filename"/> inside a "Logs" folder
+        /// under the application base directory and returns the full path of the written file.
+        /// </summary>
+        private static string WriteFile(string filename, string text)
+        {
+            // CreateDirectory does nothing when the folder already exists
+            DirectoryInfo dir = Directory.CreateDirectory(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Logs"));
+
+            string path = Path.Combine(dir.FullName, filename);
+            File.WriteAllText(path, text);
+            return path;
+        }
+    }
+
+    /// <summary>
+    /// Renders the SimpleBrowser log template through RazorEngine.
+    /// </summary>
+    public class RenderService : HtmlLogFormatter.IViewRenderService
+    {
+        /// <summary>Compiles and runs <paramref name="template"/>, passing <paramref name="title"/> as the template name argument.</summary>
+        public string RenderToString<TModel>(string template, string title, TModel model)
+        {
+            // fall back to the static type so a null model doesn't throw NullReferenceException here
+            return RazorEngine.Engine.Razor.RunCompile(template, title, model?.GetType() ?? typeof(TModel), model);
+        }
+    }
+
+}
+
diff --git a/WebScrape/Program.cs b/WebScrape/Program.cs
new file mode 100644
index 0000000..19c7cca
--- /dev/null
+++ b/WebScrape/Program.cs
@@ -0,0 +1,67 @@
+using HtmlAgilityPack;
+using ScrapySharp.Network;
+using ScrapySharp.Extensions;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Net.Http;
+using System.Text;
+using System.Threading.Tasks;
+using System.Xml;
+
+namespace WebScrape
+{
+    class Program
+    {
+        // one browser instance shared by every request; never reassigned
+        private static readonly ScrapingBrowser _scrapingBrowser = new ScrapingBrowser();
+
+        /// <summary>
+        /// Fetches the Avanza stock list page and waits for a key press before exiting.
+        /// </summary>
+        static void Main(string[] args)
+        {
+            //GetHtmlAsync();
+            //GetHtml("https://www.di.se/bors/aktier");
+            GetHtml("https://www.avanza.se/aktier/lista.html");
+
+            Console.ReadKey();
+        }
+
+        /// <summary>
+        /// Navigates the shared browser to <paramref name="url"/> and returns the page's HTML node.
+        /// </summary>
+        static HtmlNode GetHtml(string url)
+        {
+            WebPage webPage = _scrapingBrowser.NavigateToPage(new Uri(url));
+            return webPage.Html;
+        }
+
+        //private static async void GetHtmlAsync()
+        //{
+        //    //var url = "https://www.di.se/bors/aktier";
+        //    var url = "https://www.di.se/bors/aktier/?data%5Bcountry%5D=SE&data%5Bmarket%5D=35207&data%5Bmarket%5D=35208&data%5Bmarket%5D=35209&data%5Bsector%5D=1&data%5Bsector%5D=2&data%5Bsector%5D=3&data%5Bsector%5D=4&data%5Bsector%5D=5&data%5Bsector%5D=6&data%5Bsector%5D=7&data%5Bsector%5D=8&data%5Bsector%5D=9&data%5Bsector%5D=10&field=name&tab=0";
+
+        //    var httpClient = new HttpClient();
+        //    var html = await httpClient.GetStringAsync(url);
+
+        //    var htmlDocument = new HtmlDocument();
+        //    htmlDocument.LoadHtml(html);
+
+        //    var shareList = htmlDocument.DocumentNode.Descendants("table")
+        //        .Where(node => node.GetAttributeValue("data-tab", "")
+        //        .Equals("table_0")).ToList();
+
+        //    var totLst = new List<HtmlNode>();
+        //    foreach(var htmldoc in shareList)
+        //    {
+        //        var trow = htmldoc.Descendants("tr")
+        //            .Where(node => node.GetAttributeValue("id", "")
+        //            .Contains("ins_")).ToList();
+        //        totLst.AddRange(trow);
+        //    }
+
+        //    Console.WriteLine();
+        //}
+    }
+}
diff --git a/WebScrape/WebScrape.csproj b/WebScrape/WebScrape.csproj
new file mode 100644
index 0000000..a126e84
--- /dev/null
+++ b/WebScrape/WebScrape.csproj
@@ -0,0 +1,13 @@
+
+
+
+    Exe
+    net5.0
+
+
+
+
+
+
+
+
diff --git a/WebScrapeApp.sln b/WebScrapeApp.sln
new file mode 100644
index 0000000..eaa3678
--- /dev/null
+++ b/WebScrapeApp.sln
@@ -0,0 +1,37 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.31313.79
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "WebScrape", "WebScrape\WebScrape.csproj", "{DF64E211-B223-48C5-83F8-60792D8DFB04}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CsharpCorner", "CsharpCorne\CsharpCorner.csproj", "{BF147D4E-3654-4FF7-9B71-7E1B67CE7B0D}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HeadlessBrowser", "HeadlessBrowser\HeadlessBrowser.csproj", "{F0E685F6-9B05-42D1-B2F2-B14C861668A3}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Release|Any CPU = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{DF64E211-B223-48C5-83F8-60792D8DFB04}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{DF64E211-B223-48C5-83F8-60792D8DFB04}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{DF64E211-B223-48C5-83F8-60792D8DFB04}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{DF64E211-B223-48C5-83F8-60792D8DFB04}.Release|Any CPU.Build.0 = Release|Any CPU
+		{BF147D4E-3654-4FF7-9B71-7E1B67CE7B0D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{BF147D4E-3654-4FF7-9B71-7E1B67CE7B0D}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{BF147D4E-3654-4FF7-9B71-7E1B67CE7B0D}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{BF147D4E-3654-4FF7-9B71-7E1B67CE7B0D}.Release|Any CPU.Build.0 = Release|Any CPU
+		{F0E685F6-9B05-42D1-B2F2-B14C861668A3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{F0E685F6-9B05-42D1-B2F2-B14C861668A3}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{F0E685F6-9B05-42D1-B2F2-B14C861668A3}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{F0E685F6-9B05-42D1-B2F2-B14C861668A3}.Release|Any CPU.Build.0 = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {998091CC-0A32-4109-B1B5-0AA12A3E5D5C}
+	EndGlobalSection
+EndGlobal