Reviewer note (text before the first "diff --git" header is ignored by git apply):
  * This patch was rebuilt from a whitespace-collapsed copy. Indentation and blank context
    lines are best-effort, so apply with `git apply --ignore-whitespace` and re-check context.
  * Generic type arguments had been stripped from the copy. They are restored only where the
    visible code determines them; context lines that could not be restored were trimmed from
    their hunks and the hunk headers recounted.
  * Program.cs: the ignore list is now saved in a finally block so ids survive a failed download.
  * PatreonCrawledUrlFilter.cs: Path.Combine, null-safe load, de-duplicated O(1) id set,
    specific exception types, target directory created before saving.
  * PatreonPageCrawler.cs: the whitespace-only change to PostCrawlEnd was dropped.
  * "index" lines are omitted for files whose post-image differs from the original patch.

diff --git a/PatreonDownloader.App/Program.cs b/PatreonDownloader.App/Program.cs
--- a/PatreonDownloader.App/Program.cs
+++ b/PatreonDownloader.App/Program.cs
@@ -113,6 +113,9 @@ private static async Task RunPatreonDownloader(CommandLineOptions commandLineOpt
                 return;
             }
 
+            PatreonDownloaderSettings settings = await InitializeSettings(commandLineOptions);
+            PatreonCrawledUrlFilter filter = PatreonCrawledUrlFilter.GetInstance(settings);
+
             _universalDownloader = new UniversalDownloader(new PatreonDownloaderModule());
 
             _filesDownloaded = 0;
@@ -124,8 +127,15 @@ private static async Task RunPatreonDownloader(CommandLineOptions commandLineOpt
             _universalDownloader.CrawlerMessage += UniversalDownloaderOnCrawlerMessage;
             _universalDownloader.FileDownloaded += UniversalDownloaderOnFileDownloaded;
 
-            PatreonDownloaderSettings settings = await InitializeSettings(commandLineOptions);
-            await _universalDownloader.Download(commandLineOptions.Url, settings);
+            try
+            {
+                await _universalDownloader.Download(commandLineOptions.Url, settings);
+            }
+            finally
+            {
+                // Persist the ids recorded so far even when the download throws.
+                filter.SaveIgnorePostsToJson();
+            }
 
             _universalDownloader.StatusChanged -= UniversalDownloaderOnStatusChanged;
             _universalDownloader.PostCrawlStart -= UniversalDownloaderOnPostCrawlStart;
diff --git a/PatreonDownloader.Implementation/Models/JSONObjects/IgnorePosts.cs b/PatreonDownloader.Implementation/Models/JSONObjects/IgnorePosts.cs
new file mode 100644
index 0000000..47274cd
--- /dev/null
+++ b/PatreonDownloader.Implementation/Models/JSONObjects/IgnorePosts.cs
@@ -0,0 +1,10 @@
+using Newtonsoft.Json;
+
+namespace PatreonDownloader.Implementation.Models.JSONObjects.IgnorePosts
+{
+    public class IgnorePost
+    {
+        [JsonProperty("id")]
+        public string Id { get; set; }
+    }
+}
diff --git a/PatreonDownloader.Implementation/PatreonCookieValidator.cs b/PatreonDownloader.Implementation/PatreonCookieValidator.cs
index 00d8f63..05e3fa8 100644
--- a/PatreonDownloader.Implementation/PatreonCookieValidator.cs
+++ b/PatreonDownloader.Implementation/PatreonCookieValidator.cs
@@ -26,7 +26,7 @@ public async Task ValidateCookies(CookieContainer cookieContainer)
             if (cookieContainer == null)
                 throw new ArgumentNullException(nameof(cookieContainer));
 
-            CookieCollection cookies = cookieContainer.GetCookies(new Uri("https://patreon.com"));
+            CookieCollection cookies = cookieContainer.GetAllCookies();
 
             if (cookies["__cf_bm"] == null)
                 throw new CookieValidationException("__cf_bm cookie not found");
diff --git a/PatreonDownloader.Implementation/PatreonCrawledUrlFilter.cs b/PatreonDownloader.Implementation/PatreonCrawledUrlFilter.cs
new file mode 100644
--- /dev/null
+++ b/PatreonDownloader.Implementation/PatreonCrawledUrlFilter.cs
@@ -0,0 +1,117 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Newtonsoft.Json;
+using PatreonDownloader.Implementation.Models.JSONObjects.IgnorePosts;
+using UniversalDownloaderPlatform.Common.Interfaces.Models;
+
+namespace PatreonDownloader.Implementation;
+
+/// <summary>
+/// Shared registry of post ids to skip, persisted as <c>ignorePosts.json</c> in the download directory.
+/// </summary>
+/// <remarks>
+/// NOTE(review): no member is synchronized; confirm crawler and plugin calls never overlap.
+/// </remarks>
+public class PatreonCrawledUrlFilter
+{
+    private const string IgnorePostsFileName = "ignorePosts.json";
+
+    private static PatreonCrawledUrlFilter _instance;
+
+    // A set keeps one entry per post id (the plugin adds the id after every downloaded url) and gives O(1) lookups.
+    private readonly HashSet<string> _ignoredPostIds;
+    private readonly IUniversalDownloaderPlatformSettings _settings;
+
+    private PatreonCrawledUrlFilter(IUniversalDownloaderPlatformSettings settings)
+    {
+        _settings = settings ?? throw new ArgumentNullException(nameof(settings));
+        _ignoredPostIds = LoadIgnoredPostIds();
+    }
+
+    /// <summary>Returns the instance created by an earlier call to the overload that takes settings.</summary>
+    /// <exception cref="InvalidOperationException">No instance has been created yet.</exception>
+    public static PatreonCrawledUrlFilter GetInstance()
+    {
+        return _instance ?? throw new InvalidOperationException("Instance not initialized");
+    }
+
+    /// <summary>
+    /// Returns the shared instance, creating it from <paramref name="settings"/> on the first call.
+    /// Later calls return the existing instance and ignore <paramref name="settings"/>.
+    /// </summary>
+    /// <exception cref="ArgumentNullException">The instance must be created and <paramref name="settings"/> is null.</exception>
+    public static PatreonCrawledUrlFilter GetInstance(IUniversalDownloaderPlatformSettings settings)
+    {
+        return _instance ??= new PatreonCrawledUrlFilter(settings);
+    }
+
+    /// <summary>Writes the current ignore list to the json file, creating the target directory when needed.</summary>
+    public void SaveIgnorePostsToJson()
+    {
+        string filePath = GetIgnorePostsFilePath();
+        string directory = Path.GetDirectoryName(filePath);
+        if (!string.IsNullOrEmpty(directory))
+        {
+            Directory.CreateDirectory(directory);
+        }
+
+        // Same on-disk shape as before: a json array of objects with an "id" property.
+        List<IgnorePost> posts = _ignoredPostIds.Select(id => new IgnorePost { Id = id }).ToList();
+        File.WriteAllText(filePath, JsonConvert.SerializeObject(posts));
+    }
+
+    /// <summary>Removes every entry of <paramref name="crawledUrls"/> whose post id is on the ignore list.</summary>
+    public void FilterOutPages(List<PatreonCrawledUrl> crawledUrls)
+    {
+        if (crawledUrls is null)
+        {
+            throw new ArgumentNullException(nameof(crawledUrls));
+        }
+
+        crawledUrls.RemoveAll(url => _ignoredPostIds.Contains(url.PostId));
+    }
+
+    /// <summary>Adds the id of <paramref name="ignorePost"/> to the ignore list; entries without an id are skipped.</summary>
+    public void AddIgnorePost(IgnorePost ignorePost)
+    {
+        if (ignorePost is null)
+        {
+            throw new ArgumentNullException(nameof(ignorePost));
+        }
+
+        if (!string.IsNullOrEmpty(ignorePost.Id))
+        {
+            _ignoredPostIds.Add(ignorePost.Id);
+        }
+    }
+
+    private string GetIgnorePostsFilePath()
+    {
+        // Path.Combine throws on null, so an unset download directory resolves relative to the working directory.
+        return Path.Combine(_settings.DownloadDirectory ?? string.Empty, IgnorePostsFileName);
+    }
+
+    private HashSet<string> LoadIgnoredPostIds()
+    {
+        var ids = new HashSet<string>(StringComparer.Ordinal);
+        string filePath = GetIgnorePostsFilePath();
+        if (!File.Exists(filePath))
+        {
+            return ids;
+        }
+
+        // DeserializeObject yields null for an empty or literal "null" document, so fall back to an empty list.
+        List<IgnorePost> stored = JsonConvert.DeserializeObject<List<IgnorePost>>(File.ReadAllText(filePath));
+        foreach (IgnorePost post in stored ?? new List<IgnorePost>())
+        {
+            if (!string.IsNullOrEmpty(post?.Id))
+            {
+                ids.Add(post.Id);
+            }
+        }
+
+        return ids;
+    }
+}
diff --git a/PatreonDownloader.Implementation/PatreonDefaultPlugin.cs b/PatreonDownloader.Implementation/PatreonDefaultPlugin.cs
index 544c2b5..80d2bc8 100644
--- a/PatreonDownloader.Implementation/PatreonDefaultPlugin.cs
+++ b/PatreonDownloader.Implementation/PatreonDefaultPlugin.cs
@@ -9,6 +9,7 @@
 using PatreonDownloader.Implementation;
 using PatreonDownloader.Implementation.Enums;
 using PatreonDownloader.Implementation.Interfaces;
+using PatreonDownloader.Implementation.Models.JSONObjects.IgnorePosts;
 using UniversalDownloaderPlatform.Common.Exceptions;
 using UniversalDownloaderPlatform.Common.Interfaces;
 using UniversalDownloaderPlatform.Common.Interfaces.Models;
@@ -24,6 +25,7 @@ namespace PatreonDownloader.Engine
     internal sealed class PatreonDefaultPlugin : IPlugin
     {
         private IWebDownloader _webDownloader;
+        private PatreonCrawledUrlFilter _crawledUrlFilter;
 
         private readonly Logger _logger = LogManager.GetCurrentClassLogger();
 
@@ -37,6 +39,7 @@ internal sealed class PatreonDefaultPlugin : IPlugin
         public PatreonDefaultPlugin(IWebDownloader webDownloader)
         {
             _webDownloader = webDownloader ?? throw new ArgumentNullException(nameof(webDownloader));
+            _crawledUrlFilter = PatreonCrawledUrlFilter.GetInstance();
         }
 
         public void OnLoad(IDependencyResolver dependencyResolver)
@@ -61,6 +64,12 @@ public async Task Download(ICrawledUrl crawledUrl)
                 throw new DownloadException($"Download path is not filled for {crawledUrl.Url}");
 
             await _webDownloader.DownloadFile(crawledUrl.Url, Path.Combine(_settings.DownloadDirectory, crawledUrl.DownloadPath), null); //referer is set in PatreonWebDownloader
+
+            PatreonCrawledUrl crawledUrlPatreon = crawledUrl as PatreonCrawledUrl;
+            if(crawledUrlPatreon == null) throw new DownloadException("Crawled url is not of type PatreonCrawledUrl");
+
+            IgnorePost ignorePost = new IgnorePost {Id = crawledUrlPatreon.PostId};
+            _crawledUrlFilter.AddIgnorePost(ignorePost);
         }
 
         public Task BeforeStart(IUniversalDownloaderPlatformSettings settings)
diff --git a/PatreonDownloader.Implementation/PatreonPageCrawler.cs b/PatreonDownloader.Implementation/PatreonPageCrawler.cs
--- a/PatreonDownloader.Implementation/PatreonPageCrawler.cs
+++ b/PatreonDownloader.Implementation/PatreonPageCrawler.cs
@@ -22,4 +22,5 @@ internal sealed class PatreonPageCrawler : IPageCrawler
         private readonly IPluginManager _pluginManager;
         private readonly Logger _logger = LogManager.GetCurrentClassLogger();
 
+        private PatreonCrawledUrlFilter _patreonCrawledUrlFilter;
         private PatreonDownloaderSettings _patreonDownloaderSettings;
@@ -49,4 +50,5 @@ public PatreonPageCrawler(IWebDownloader webDownloader, IPluginManager pluginMan
         public async Task BeforeStart(IUniversalDownloaderPlatformSettings settings)
         {
-            _patreonDownloaderSettings = (PatreonDownloaderSettings) settings;
+            _patreonDownloaderSettings = (PatreonDownloaderSettings)settings;
+            _patreonCrawledUrlFilter = PatreonCrawledUrlFilter.GetInstance(settings);
         }
@@ -86,6 +88,7 @@ await File.WriteAllTextAsync(Path.Combine(_patreonDownloaderSettings.DownloadDir
                         json);
 
                 ParsingResult result = await ParsePage(json);
+                _patreonCrawledUrlFilter.FilterOutPages(result.CrawledUrls);
 
                 if(result.CrawledUrls.Count > 0)
                     crawledUrls.AddRange(result.CrawledUrls);