From 8d198d46e03373aa943e6c6667dfce7f78f85cdd Mon Sep 17 00:00:00 2001 From: David Claeys Date: Fri, 10 May 2024 16:06:42 +0200 Subject: [PATCH] initial commit --- .dockerignore | 30 + .gitea/workflows/action.yml | 35 ++ .gitignore | 1 + Dockerfile | 48 ++ README.md | 5 +- TelebilbaoEpg.Database/Models/BroadCast.cs | 20 + .../Repositories/BaseRepository.cs | 26 + .../Repositories/BroadCastRepository.cs | 33 + .../Repositories/IBroadCastRepository.cs | 15 + .../TelebilbaoEpg.Database.csproj | 12 + TelebilbaoEpg.sln | 31 + .../Controllers/BroadCastController.cs | 31 + TelebilbaoEpg/Jobs/ScrapeJob.cs | 567 ++++++++++++++++++ TelebilbaoEpg/Program.cs | 64 ++ TelebilbaoEpg/Properties/launchSettings.json | 52 ++ TelebilbaoEpg/TelebilbaoEpg.csproj | 26 + TelebilbaoEpg/appsettings.Development.json | 13 + TelebilbaoEpg/appsettings.json | 14 + 18 files changed, 1022 insertions(+), 1 deletion(-) create mode 100644 .dockerignore create mode 100644 .gitea/workflows/action.yml create mode 100644 Dockerfile create mode 100644 TelebilbaoEpg.Database/Models/BroadCast.cs create mode 100644 TelebilbaoEpg.Database/Repositories/BaseRepository.cs create mode 100644 TelebilbaoEpg.Database/Repositories/BroadCastRepository.cs create mode 100644 TelebilbaoEpg.Database/Repositories/IBroadCastRepository.cs create mode 100644 TelebilbaoEpg.Database/TelebilbaoEpg.Database.csproj create mode 100644 TelebilbaoEpg.sln create mode 100644 TelebilbaoEpg/Controllers/BroadCastController.cs create mode 100644 TelebilbaoEpg/Jobs/ScrapeJob.cs create mode 100644 TelebilbaoEpg/Program.cs create mode 100644 TelebilbaoEpg/Properties/launchSettings.json create mode 100644 TelebilbaoEpg/TelebilbaoEpg.csproj create mode 100644 TelebilbaoEpg/appsettings.Development.json create mode 100644 TelebilbaoEpg/appsettings.json diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..fe1152b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,30 @@ +**/.classpath +**/.dockerignore +**/.env +**/.git 
+**/.gitignore +**/.project +**/.settings +**/.toolstarget +**/.vs +**/.vscode +**/*.*proj.user +**/*.dbmdl +**/*.jfm +**/azds.yaml +**/bin +**/charts +**/docker-compose* +**/Dockerfile* +**/node_modules +**/npm-debug.log +**/obj +**/secrets.dev.yaml +**/values.dev.yaml +LICENSE +README.md +!**/.gitignore +!.git/HEAD +!.git/config +!.git/packed-refs +!.git/refs/heads/** \ No newline at end of file diff --git a/.gitea/workflows/action.yml b/.gitea/workflows/action.yml new file mode 100644 index 0000000..5cdf44f --- /dev/null +++ b/.gitea/workflows/action.yml @@ -0,0 +1,35 @@ +name: 'Build docker container' +on: [push] +jobs: + build: + runs-on: ubuntu-latest + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Install Docker + run: | + echo "Checking docker installation" + if command -v docker &> /dev/null; then + echo "Docker installation found" + else + echo "Docker installation not found. Docker will be installed" + curl -fsSL https://get.docker.com | sh + fi + - name: Set up Docker Buildx + uses: https://github.com/docker/setup-buildx-action@v3 + - name: Docker login + uses: https://github.com/docker/login-action@v3 + with: + registry: git.claeyscloud.com + username: nologin + password: ${{ secrets.PACKAGE_TOKEN }} + - name: Build and push + uses: https://github.com/docker/build-push-action@v5 + with: + context: . 
+ push: true + tags: | + git.claeyscloud.com/david/telebilbao-epg diff --git a/.gitignore b/.gitignore index ca1c7a3..6e6be2d 100644 --- a/.gitignore +++ b/.gitignore @@ -398,3 +398,4 @@ FodyWeavers.xsd # JetBrains Rider *.sln.iml +/TelebilbaoEpg/telebilbaoEpg.db diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..155f8f3 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,48 @@ +ARG CERT_PASSWORD_ARG=3vo3rmb5DBJXsryjMfJsrpjbKsbj8B +FROM mcr.microsoft.com/dotnet/sdk:8.0-alpine-amd64 as build-env +ARG CERT_PASSWORD_ARG +ENV CERT_PASSWORD=$CERT_PASSWORD_ARG +WORKDIR /App +COPY . ./ +RUN dotnet restore \ + && dotnet publish TelebilbaoEpg/TelebilbaoEpg.csproj --no-restore --self-contained false -c Release -o out /p:UseAppHost=false \ + && dotnet dev-certs https --export-path /config/aspnetapp.pem --password "$CERT_PASSWORD" --format PEM \ + && rm **/appsettings.Development.json && rm **/*.pdb + +FROM mcr.microsoft.com/dotnet/aspnet:8.0-alpine-amd64 +ARG CERT_PASSWORD_ARG +ENV CERT_PASSWORD=$CERT_PASSWORD_ARG +WORKDIR /App +RUN apk update \ + && apk upgrade --available \ + && apk add ca-certificates \ + && apk add tzdata \ + && mkdir /config && mkdir -p /usr/local/share/ca-certificates/ +COPY --from=build-env /App/out . 
+COPY --from=build-env /config /config +ENV DOTNET_CLI_TELEMETRY_OPTOUT=1 +ENV ASPNETCORE_ENVIRONMENT=Production +ENV ASPNETCORE_URLS=http://+:80;https://+:443 +ENV ASPNETCORE_Kestrel__Certificates__Default__Path=/usr/local/share/ca-certificates/aspnetapp.crt +ENV ASPNETCORE_Kestrel__Certificates__Default__KeyPath=/usr/local/share/ca-certificates/aspnetapp.key +ENV ASPNETCORE_Kestrel__Certificates__Default__Password=$CERT_PASSWORD +RUN chown -R app:app /App/* \ + && cp /config/aspnetapp.pem $ASPNETCORE_Kestrel__Certificates__Default__Path \ + && cp /config/aspnetapp.key $ASPNETCORE_Kestrel__Certificates__Default__KeyPath \ + && chmod 755 $ASPNETCORE_Kestrel__Certificates__Default__Path && chmod +x $ASPNETCORE_Kestrel__Certificates__Default__Path \ + && chown app:app $ASPNETCORE_Kestrel__Certificates__Default__Path \ + && cat $ASPNETCORE_Kestrel__Certificates__Default__Path >> /etc/ssl/certs/ca-certificates.crt \ + && chmod 755 $ASPNETCORE_Kestrel__Certificates__Default__KeyPath && chmod +x $ASPNETCORE_Kestrel__Certificates__Default__KeyPath \ + && chown app:app $ASPNETCORE_Kestrel__Certificates__Default__KeyPath \ + && rm -rf /tmp && mkdir /tmp && chmod 755 /tmp && chown app:app /tmp \ + && update-ca-certificates \ + && rm -rf /config \ + && rm -rf /var/cache/apk/* \ + && mkdir /data && chmod 755 /data \ + && cat > /data/telebilbaoEpg.db \ + && chmod 777 /data/telebilbaoEpg.db \ + && chown -R app:app /data/* + +ENTRYPOINT ["dotnet", "TelebilbaoEpg.dll"] +EXPOSE 80 +EXPOSE 443 \ No newline at end of file diff --git a/README.md b/README.md index 27085f2..84a264e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ # Telebilbao-Epg -Scraper with api for telebilbao epg \ No newline at end of file +Scraper with api for telebilbao epg + +Epg information for this local tv station is available at https://www.telebilbao.es/programacion/ +This program tries to parse the information on that page and to expose it with an api. 
\ No newline at end of file diff --git a/TelebilbaoEpg.Database/Models/BroadCast.cs b/TelebilbaoEpg.Database/Models/BroadCast.cs new file mode 100644 index 0000000..7c44174 --- /dev/null +++ b/TelebilbaoEpg.Database/Models/BroadCast.cs @@ -0,0 +1,20 @@ +using SQLite; +using System; +namespace TelebilbaoEpg.Database.Models +{ + public class BroadCast + { + [PrimaryKey, AutoIncrement] + public int Id { get; set; } + + public DateTime From { get; set; } + + public DateTime To { get; set; } + + public string Name { get; set; } = string.Empty; + + public string Description { get; set; } = string.Empty; + + public string ImageUrl { get; set; } = string.Empty; + } +} diff --git a/TelebilbaoEpg.Database/Repositories/BaseRepository.cs b/TelebilbaoEpg.Database/Repositories/BaseRepository.cs new file mode 100644 index 0000000..46bc04e --- /dev/null +++ b/TelebilbaoEpg.Database/Repositories/BaseRepository.cs @@ -0,0 +1,26 @@ +using SQLite; +using System.IO; +using TelebilbaoEpg.Database.Models; + +namespace TelebilbaoEpg.Database.Repository +{ + public abstract class BaseRepository + { + protected SQLiteConnection _db; + + public BaseRepository() + { + var storeFile = "/data/telebilbaoEpg.db"; + +#if DEBUG + storeFile = storeFile.Replace("/data/", ""); +#endif + + // Get an absolute path to the database file + var databasePath = Path.Combine(Directory.GetCurrentDirectory(), storeFile); + + _db = new SQLiteConnection(databasePath); + _db.CreateTable(); + } + } +} diff --git a/TelebilbaoEpg.Database/Repositories/BroadCastRepository.cs b/TelebilbaoEpg.Database/Repositories/BroadCastRepository.cs new file mode 100644 index 0000000..d41dc55 --- /dev/null +++ b/TelebilbaoEpg.Database/Repositories/BroadCastRepository.cs @@ -0,0 +1,33 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using TelebilbaoEpg.Database.Models; + +namespace TelebilbaoEpg.Database.Repository +{ + public class BroadCastRepository : BaseRepository, IBroadCastRepository + { + public void 
Add(BroadCast broadCast) + { + _db.Insert(broadCast); + } + + public List GetBroadCasts(DateOnly day) + { + return _db.Table() + .ToList() + .Where(b => DateOnly.FromDateTime(b.From.Date) == day || DateOnly.FromDateTime(b.To) == day) + .OrderBy(b => b.From) + .ToList(); + } + + public List GetBroadCasts(DateOnly from, DateOnly to) + { + return _db.Table() + .ToList() + .Where(b => (DateOnly.FromDateTime(b.From) >= from || DateOnly.FromDateTime(b.To) >= from) && (DateOnly.FromDateTime(b.From) <= to || DateOnly.FromDateTime(b.To) <= to)) + .OrderBy(b => b.From) + .ToList(); + } + } +} diff --git a/TelebilbaoEpg.Database/Repositories/IBroadCastRepository.cs b/TelebilbaoEpg.Database/Repositories/IBroadCastRepository.cs new file mode 100644 index 0000000..0b36f63 --- /dev/null +++ b/TelebilbaoEpg.Database/Repositories/IBroadCastRepository.cs @@ -0,0 +1,15 @@ +using System; +using System.Collections.Generic; +using TelebilbaoEpg.Database.Models; + +namespace TelebilbaoEpg.Database.Repository +{ + public interface IBroadCastRepository + { + List GetBroadCasts(DateOnly day); + + List GetBroadCasts(DateOnly from, DateOnly to); + + void Add(BroadCast broadCast); + } +} diff --git a/TelebilbaoEpg.Database/TelebilbaoEpg.Database.csproj b/TelebilbaoEpg.Database/TelebilbaoEpg.Database.csproj new file mode 100644 index 0000000..41595ab --- /dev/null +++ b/TelebilbaoEpg.Database/TelebilbaoEpg.Database.csproj @@ -0,0 +1,12 @@ + + + + net8 + enable + + + + + + + diff --git a/TelebilbaoEpg.sln b/TelebilbaoEpg.sln new file mode 100644 index 0000000..e4de315 --- /dev/null +++ b/TelebilbaoEpg.sln @@ -0,0 +1,31 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.9.34728.123 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TelebilbaoEpg", "TelebilbaoEpg\TelebilbaoEpg.csproj", "{D9128F6D-C239-40E0-83DB-E98FEE81B5F8}" +EndProject 
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TelebilbaoEpg.Database", "TelebilbaoEpg.Database\TelebilbaoEpg.Database.csproj", "{ABDB2C86-77F1-4E3F-A224-F1066323BCDE}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {D9128F6D-C239-40E0-83DB-E98FEE81B5F8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D9128F6D-C239-40E0-83DB-E98FEE81B5F8}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D9128F6D-C239-40E0-83DB-E98FEE81B5F8}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D9128F6D-C239-40E0-83DB-E98FEE81B5F8}.Release|Any CPU.Build.0 = Release|Any CPU + {ABDB2C86-77F1-4E3F-A224-F1066323BCDE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {ABDB2C86-77F1-4E3F-A224-F1066323BCDE}.Debug|Any CPU.Build.0 = Debug|Any CPU + {ABDB2C86-77F1-4E3F-A224-F1066323BCDE}.Release|Any CPU.ActiveCfg = Release|Any CPU + {ABDB2C86-77F1-4E3F-A224-F1066323BCDE}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {CB4CE024-1BA3-404B-A8F8-E60361DE0384} + EndGlobalSection +EndGlobal diff --git a/TelebilbaoEpg/Controllers/BroadCastController.cs b/TelebilbaoEpg/Controllers/BroadCastController.cs new file mode 100644 index 0000000..4f86cf1 --- /dev/null +++ b/TelebilbaoEpg/Controllers/BroadCastController.cs @@ -0,0 +1,31 @@ +using Microsoft.AspNetCore.Mvc; +using TelebilbaoEpg.Database.Models; +using TelebilbaoEpg.Database.Repository; + +namespace TelebilbaoEpg.Controllers +{ + [Route("api/[controller]")] + [ApiController] + public class BroadCastController : ControllerBase + { + private IBroadCastRepository _broadCastRepository; + + public BroadCastController(IBroadCastRepository broadCastRepository) + { + _broadCastRepository = broadCastRepository; + } 
+ + [HttpGet("today")] + public List GetToday() + { + var today = DateTime.Now.Date; + return _broadCastRepository.GetBroadCasts(DateOnly.FromDateTime(today)); + } + + [HttpGet] + public List Get(DateOnly from, DateOnly to) + { + return _broadCastRepository.GetBroadCasts(from, to); + } + } +} diff --git a/TelebilbaoEpg/Jobs/ScrapeJob.cs b/TelebilbaoEpg/Jobs/ScrapeJob.cs new file mode 100644 index 0000000..6844c59 --- /dev/null +++ b/TelebilbaoEpg/Jobs/ScrapeJob.cs @@ -0,0 +1,567 @@ +using HtmlAgilityPack; +using Quartz; +using System.Globalization; +using System.Linq; +using System.Text.RegularExpressions; +using System.Web; +using TableSpans.HtmlAgilityPack; +using TelebilbaoEpg.Database.Models; +using TelebilbaoEpg.Database.Repository; + +namespace TelebilbaoEpg.Jobs +{ + public class ScrapeJob : IJob + { + private IConfiguration _configuration; + private IBroadCastRepository _broadCastRepository; + + public ScrapeJob(IConfiguration configuration, IBroadCastRepository broadCastRepository) + { + _configuration = configuration; + _broadCastRepository = broadCastRepository; + } + + private List GetTimeBlocks(HtmlNode programTable) + { + var ret = new List(); + + var timeBlocks = programTable.SelectNodes("tbody/tr/td[1]"); + + if (timeBlocks != null) + { + TimeOnly? previousTime = null; + + var index = 0; + var blockIndex = 0; + + foreach (var node in timeBlocks) + { + if (!ret.Any(b => b.RowIndex == index)) + { + + var text = node.InnerText; + + var currentBlock = new TimeBlock() + { + RowIndex = index, + BlockIndex = blockIndex, + }; + + if (!string.IsNullOrEmpty(text)) + { + TimeOnly? 
parsedValue = null; + + try + { + var sanitizedtext = text.Replace("::", ":"); + parsedValue = TimeOnly.Parse(sanitizedtext); + } + catch (FormatException) + { + var sections = text.Split('.'); + + if (sections.Length == 2) + { + var hourSection = sections[0]; + + var minuteSection = sections[1]; + + if (!string.IsNullOrEmpty(hourSection) && !string.IsNullOrEmpty(minuteSection)) + { + var hour = int.Parse(hourSection); + + var minute = int.Parse(minuteSection); + + parsedValue = new TimeOnly(hour, minute); + } + } + } + + if (parsedValue.HasValue) + { + currentBlock.From = parsedValue.Value; + } + } + + var shouldAdd = !ret.Any(b => b.From > currentBlock.From); + + if (!shouldAdd) + { + //start of day by blocks + var startDay = ret.First(b => b.BlockIndex == 0).From; + + if (currentBlock.From < startDay) + { + shouldAdd = ret.Any(b => currentBlock.From < b.From); + } + } + + shouldAdd = shouldAdd && currentBlock.From.HasValue; + + if (shouldAdd) + { + ret.Add(currentBlock); + + if (previousTime.HasValue) + { + var previousBlock = ret + .OrderByDescending(b => b.RowIndex) + .FirstOrDefault(b => b.From < currentBlock.From); + + if (previousBlock != null) + { + previousBlock.To = currentBlock.From.Value; + } + } + + previousTime = currentBlock.From; + blockIndex++; + } + } + + index++; + } + + var firstBlock = ret.OrderBy(b => b.RowIndex) + .FirstOrDefault(); + + var lastBlock = ret.OrderByDescending(b => b.RowIndex) + .FirstOrDefault(); + + if (firstBlock != null && lastBlock != null && firstBlock.From.HasValue) + { + lastBlock.To = firstBlock.From.Value; + } + } + + return ret; + } + + + public async Task Execute(IJobExecutionContext context) + { + var tableScrapeUrl = _configuration.GetValue("TableScrapeUrl"); + HtmlWeb hw = new HtmlWeb(); + HtmlDocument doc = hw.Load(tableScrapeUrl); + + var tableSpanExtension = new TableSpansExtension(); + + var programTable = tableSpanExtension.ProcessTable(doc.DocumentNode.SelectSingleNode("//table")); + + var timeBlocks = 
GetTimeBlocks(programTable); + + // week starts at monday; Sunday is DayOfWeek 0, so step back (dow + 6) % 7 days instead of jumping forward to next monday + var startOfWeek = DateTime.Now.Date.AddDays(-(((int)DateTime.Now.DayOfWeek + 6) % 7)); + + var dayColumnStart = 2; + var dayColumnEnd = dayColumnStart + 7; + + + var parsedBroadCasts = new List(); + + var tableRows = programTable.SelectNodes($"tbody/tr"); + + for (int dayIndex = dayColumnStart; dayIndex < dayColumnEnd; dayIndex++) + { + var programBlocks = programTable.SelectNodes($"tbody/tr/td[{dayIndex}]"); + + if (programBlocks != null) + { + var day = startOfWeek.AddDays(dayIndex - dayColumnStart); + + //reset counter + var rowIndex = 0; + + foreach (var programBlock in programBlocks) + { + var currentDay = day; + var columnIndex = dayIndex; + var beginIndex = rowIndex; + var rowSpan = 0; + + var rowPathIndex = programBlock.XPath.IndexOf("/tr"); + var xpath = $"//table/tbody{programBlock.XPath.Substring(rowPathIndex)}"; + var originalNode = doc.DocumentNode.SelectSingleNode(xpath); + + if (originalNode != null) + { + if (originalNode.Attributes.Contains("rowspan")) + { + rowSpan = int.Parse(originalNode.Attributes["rowspan"].Value); + } + } + + var broadCastsToAdd = new List(); + + TimeOnly? startTime = null; + TimeOnly? 
endTime = null; + + var startBlock = timeBlocks.FirstOrDefault(b => b.RowIndex == beginIndex); + + if (startBlock == null) + { + startBlock = timeBlocks.OrderByDescending(b => b.RowIndex) + .Where(b => b.RowIndex <= rowIndex + 1) + .FirstOrDefault(); + } + + if (startBlock != null) + { + startTime = startBlock.From; + endTime = startBlock.To; + } + + if (startTime.HasValue && endTime.HasValue) + { + if (startTime.Value.Hour < 7 || endTime.Value.Hour < 7) + { + currentDay = currentDay.AddDays(1); + } + + var startDate = currentDay.AddTicks(startTime.Value.Ticks); + var endDate = currentDay.AddTicks(endTime.Value.Ticks); + + var text = HttpUtility.HtmlDecode(programBlock.InnerText); + + string timepattern = "(?:2[0-3]|[01]?[0-9])[:.][0-5]?[0-9]"; + var needsSplitByTimePattern = Regex.IsMatch(text, timepattern); + + var separator = "—"; + var needsSplitBySeparator = text.Contains(separator); + + var needsSplitByHorizontalRow = programBlock.SelectSingleNode("hr") != null; + + if (needsSplitByTimePattern) + { + var match = Regex.Match(text, timepattern); + + if (match.Success) + { + var firstProgramText = text.Substring(0, match.Index); + + var secondProgramText = text.Substring(match.Index + match.Length); + + var splitTime = TimeOnly.Parse(match.Value); + var splitDate = currentDay.AddTicks(splitTime.Ticks); + + + if (!string.IsNullOrEmpty(firstProgramText)) + { + + var firstProgram = new BroadCast() + { + From = startDate, + To = splitDate, + Name = SanitizeText(firstProgramText), + }; + + broadCastsToAdd.Add(firstProgram); + } + + if (!string.IsNullOrEmpty(secondProgramText)) + { + var secondProgram = new BroadCast() + { + From = splitDate, + To = endDate, + Name = SanitizeText(secondProgramText), + }; + broadCastsToAdd.Add(secondProgram); + } + } + } + else if (needsSplitBySeparator) + { + var separatorIndex = text.IndexOf(separator); + + var endBlock = timeBlocks.FirstOrDefault(b => b.RowIndex == beginIndex + rowSpan); + + if (endBlock == null) + { + endBlock = 
timeBlocks + .OrderByDescending(b => b.RowIndex) + .Where(b => beginIndex + rowSpan > b.RowIndex) + .FirstOrDefault(); + } + + if (endBlock != null && endBlock.To.HasValue) + { + var blockStartDate = startDate; + var blockEndtime = endBlock.To.Value; + var blockEndDate = currentDay.AddTicks(blockEndtime.Ticks); + + var duration = blockEndDate - blockStartDate; + // split on the full length in minutes; TimeSpan.Minutes is only the 0-59 component + var splitDate = rowSpan > 0 ? blockStartDate.AddMinutes((int)duration.TotalMinutes / rowSpan) : blockStartDate.AddMinutes((int)duration.TotalMinutes / 2); + + var firstProgramText = string.Empty; + + var secondProgramText = string.Empty; + + if (separatorIndex > 0) + { + firstProgramText = text.Substring(0, separatorIndex); + // skip the separator itself, otherwise SanitizeText cuts the name down to an empty string + secondProgramText = text.Substring(separatorIndex + separator.Length); + } + else + { + secondProgramText = text.Replace(separator, ""); + } + + var firstProgramName = SanitizeText(firstProgramText); + + var secondProgramName = SanitizeText(secondProgramText); + + if (!string.IsNullOrEmpty(firstProgramName)) + { + var firstProgram = new BroadCast() + { + From = startDate, + To = splitDate, + Name = firstProgramName, + }; + + broadCastsToAdd.Add(firstProgram); + } + + if (!string.IsNullOrEmpty(secondProgramName) && splitDate <= endDate) + { + var secondProgram = new BroadCast() + { + From = splitDate, + To = endDate, + Name = secondProgramName, + }; + + broadCastsToAdd.Add(secondProgram); + } + } + } + else if (needsSplitByHorizontalRow) + { + var textNodes = new List(); + + var nodeCollection = programBlock.SelectNodes("strong"); + + if (nodeCollection != null) + { + textNodes.AddRange(nodeCollection.Where(n => !string.IsNullOrEmpty(n.InnerText)).ToList()); + } + + + nodeCollection = programBlock.SelectNodes("p"); + + if (nodeCollection != null) + { + textNodes.AddRange(nodeCollection.Where(n => !string.IsNullOrEmpty(n.InnerText)).ToList()); + } + + var nodeCount = textNodes.Count; + + if (nodeCount > 0) + { + var endBlock = timeBlocks.FirstOrDefault(b => b.RowIndex == beginIndex + rowSpan); + + if 
(endBlock == null) + { + endBlock = timeBlocks + .OrderByDescending(b => b.RowIndex) + .Where(b => beginIndex + rowSpan > b.RowIndex) + .FirstOrDefault(); + } + + if (endBlock != null && endBlock.To.HasValue) + { + var blockStartDate = startDate; + var blockEndtime = endBlock.To.Value; + var blockEndDate = currentDay.AddTicks(blockEndtime.Ticks); + var duration = blockEndDate - blockStartDate; + + var itemDuration = duration.TotalMinutes / nodeCount; + + for (int i = 0; i < nodeCount; i++) + { + var nodeStartDate = blockStartDate.AddMinutes(i * itemDuration); + var nodeEndDate = nodeStartDate.AddMinutes(itemDuration); + + var node = textNodes[i]; + var nodeText = node.InnerText; + var currentText = SanitizeText(nodeText); + + var currentBroadcast = new BroadCast() + { + From = nodeStartDate, + To = nodeEndDate, + Name = currentText, + }; + + broadCastsToAdd.Add(currentBroadcast); + } + } + } + } + else + { + + var name = SanitizeText(text); + + var previousIndex = rowIndex - 1; + + var broadCast = new BroadCast() + { + From = startDate, + To = endDate, + Name = name, + }; + + broadCastsToAdd.Add(broadCast); + } + + foreach (var item in broadCastsToAdd) + { + var add = !parsedBroadCasts.Any(b => b.To >= item.From && b.Name.Equals(item.Name)) && !parsedBroadCasts.Any(b => b.From == item.From && b.To == item.To); + + if (add) + { + parsedBroadCasts.Add(item); + } + else + { + var broadCastToUpdate = parsedBroadCasts.FirstOrDefault(b => b.To >= item.From && b.Name.Equals(item.Name)); + + if (broadCastToUpdate != null) + { + broadCastToUpdate.To = item.To; + } + } + } + } + + rowIndex++; + } + } + } + + var stationProgramInformationUrl = _configuration.GetValue("StationProgramInformationUrl"); + doc = hw.Load(stationProgramInformationUrl); + + var parsedPrograms = new List(); + + + var titleNodeCollection = doc.DocumentNode.SelectNodes("//h2[contains(@class, 'programa_title')]"); + + if (titleNodeCollection != null) + { + foreach (var titleNode in titleNodeCollection) + 
{ + var title = SanitizeText(titleNode.InnerText); + var description = string.Empty; + var imageUrl = string.Empty; + + var programWrapper = titleNode.ParentNode.ParentNode.ParentNode; + + var imageWrapper = programWrapper.SelectSingleNode("div[contains(@class, 'wpb_single_image')]"); + + if (imageWrapper != null) + { + var imagenode = imageWrapper.SelectSingleNode("figure/div/img"); + + if (imagenode != null) + { + var attributeName = "src"; + imageUrl = imagenode.Attributes.Contains(attributeName) ? imagenode.Attributes[attributeName].Value : imageUrl; + } + } + + var descriptionNode = programWrapper.SelectSingleNode("div[contains(@class, 'vc_row-o-content-bottom')]"); //vc_row-o-content-bottom + + + if (descriptionNode != null) + { + description = SanitizeText(descriptionNode.InnerText); + } + + if (!string.IsNullOrEmpty(title) && !string.IsNullOrEmpty(description)) + { + var program = new ProgramItem + { + Description = description, + Name = title, + ImageUrl = imageUrl, + }; + + parsedPrograms.Add(program); + } + } + } + + foreach(var broadcast in parsedBroadCasts) + { + var program = parsedPrograms.FirstOrDefault(p => p.Name == broadcast.Name); + + if(program != null) + { + broadcast.Description = program.Description; + broadcast.ImageUrl = program.ImageUrl; + } + } + // an empty scrape must not throw: Min/Max on an empty sequence raise InvalidOperationException + var startSaveDate = parsedBroadCasts.Select(x => x.From).DefaultIfEmpty(DateTime.Now).Min(); + var endSaveDate = parsedBroadCasts.Select(x => x.To).DefaultIfEmpty(DateTime.Now).Max(); + + var savedBroadCasts = _broadCastRepository.GetBroadCasts(DateOnly.FromDateTime(startSaveDate), DateOnly.FromDateTime(endSaveDate)); + + foreach (var broadcast in parsedBroadCasts) + { + var shouldSave = !savedBroadCasts.Any(b => b.From == broadcast.From && b.To == broadcast.To && b.Name == broadcast.Name); + + if (shouldSave) + { + _broadCastRepository.Add(broadcast); + } + } + + + } + private string SanitizeText(string text) + { + var ret = string.Empty; + + if (!string.IsNullOrEmpty(text)) + { + //proper lower and upper case formatting + ret = 
CultureInfo.CurrentCulture.TextInfo.ToTitleCase(text.ToLower()).Trim(); + ret = ret.Replace("\n", " ").Replace(" ", " "); + + var separatorIndex = ret.IndexOf("—"); + + if (separatorIndex > -1) + { + ret = ret.Substring(0, separatorIndex).Trim(); + } + } + + return ret; + } + + public class TimeBlock + { + public TimeOnly? From { get; set; } + + public TimeOnly? To { get; set; } + + public int RowIndex { get; set; } + + public int BlockIndex { get; set; } + } + + public class ProgramItem + { + public string Name { get; set; } = string.Empty; + public string Description { get; set; } + + public string ImageUrl { get; set; } + } + } +} diff --git a/TelebilbaoEpg/Program.cs b/TelebilbaoEpg/Program.cs new file mode 100644 index 0000000..bd6b9f8 --- /dev/null +++ b/TelebilbaoEpg/Program.cs @@ -0,0 +1,64 @@ +using Quartz; +using TelebilbaoEpg.Database.Repository; +using TelebilbaoEpg.Jobs; + +var builder = WebApplication.CreateBuilder(args); + +// Add services to the container. + +builder.Services.AddControllers(); +// Learn more about configuring Swagger/OpenAPI at https://aka.ms/aspnetcore/swashbuckle +builder.Services.AddEndpointsApiExplorer(); +builder.Services.AddSwaggerGen(); + +builder.Services.AddQuartz(); +builder.Services.AddQuartzHostedService(opt => +{ + opt.WaitForJobsToComplete = true; +}); + +builder.Logging.ClearProviders(); +builder.Logging.AddConsole(); + +builder.Services.AddScoped(); + +var app = builder.Build(); + +// Configure the HTTP request pipeline. 
+//if (app.Environment.IsDevelopment()) +//{ + app.UseSwagger(); + app.UseSwaggerUI(); +//} + +app.UseHttpsRedirection(); + +app.UseAuthorization(); + +app.MapControllers(); + +var configuration = app.Configuration; + +string jobSchedule = configuration.GetValue("Quartz:JobSchedule"); + +var schedulerFactory = app.Services.GetRequiredService(); +var scheduler = await schedulerFactory.GetScheduler(); + +// define the job and tie it to our ScrapeJob class +var job = JobBuilder.Create() + .Build(); + +var trigger = TriggerBuilder.Create() + .WithIdentity("Cron trigger", "Scrape") + .StartNow() + .WithCronSchedule(jobSchedule) + .Build(); + +//var trigger = TriggerBuilder.Create() +// .WithIdentity("Cron trigger", "Scrape") +// .StartNow() +// .Build(); + +await scheduler.ScheduleJob(job, trigger); + +app.Run(); diff --git a/TelebilbaoEpg/Properties/launchSettings.json b/TelebilbaoEpg/Properties/launchSettings.json new file mode 100644 index 0000000..ea9ef91 --- /dev/null +++ b/TelebilbaoEpg/Properties/launchSettings.json @@ -0,0 +1,52 @@ +{ + "profiles": { + "http": { + "commandName": "Project", + "launchBrowser": true, + "launchUrl": "swagger", + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development" + }, + "dotnetRunMessages": true, + "applicationUrl": "http://localhost:5242" + }, + "https": { + "commandName": "Project", + "launchBrowser": true, + "launchUrl": "swagger", + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development" + }, + "dotnetRunMessages": true, + "applicationUrl": "https://localhost:7077;http://localhost:5242" + }, + "IIS Express": { + "commandName": "IISExpress", + "launchBrowser": true, + "launchUrl": "swagger", + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development" + } + }, + "Container (Dockerfile)": { + "commandName": "Docker", + "launchBrowser": true, + "launchUrl": "{Scheme}://{ServiceHost}:{ServicePort}/swagger", + "environmentVariables": { + "ASPNETCORE_HTTPS_PORTS": "8081", + "ASPNETCORE_HTTP_PORTS": 
"8080" + }, + "publishAllPorts": true, + "useSSL": true + } + }, + "$schema": "http://json.schemastore.org/launchsettings.json", + "iisSettings": { + "windowsAuthentication": false, + "anonymousAuthentication": true, + "iisExpress": { + "applicationUrl": "http://localhost:19838", + "sslPort": 44365 + } + } +} \ No newline at end of file diff --git a/TelebilbaoEpg/TelebilbaoEpg.csproj b/TelebilbaoEpg/TelebilbaoEpg.csproj new file mode 100644 index 0000000..0819adf --- /dev/null +++ b/TelebilbaoEpg/TelebilbaoEpg.csproj @@ -0,0 +1,26 @@ + + + + net8.0 + enable + enable + Telebilbap_Epg + 3104d886-ec84-40b1-8f80-9a0670a7d2f3 + Linux + + + + + + + + + + + + + + + + + diff --git a/TelebilbaoEpg/appsettings.Development.json b/TelebilbaoEpg/appsettings.Development.json new file mode 100644 index 0000000..fa63cfd --- /dev/null +++ b/TelebilbaoEpg/appsettings.Development.json @@ -0,0 +1,13 @@ +{ + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft.AspNetCore": "Warning" + } + }, + "Quartz": { + "JobSchedule": "0 0/1 * * * ?" + }, + "TableScrapeUrl": "https://www.telebilbao.es/programacion/", + "StationProgramInformationUrl": "https://www.telebilbao.es/" +} diff --git a/TelebilbaoEpg/appsettings.json b/TelebilbaoEpg/appsettings.json new file mode 100644 index 0000000..0eb250d --- /dev/null +++ b/TelebilbaoEpg/appsettings.json @@ -0,0 +1,14 @@ +{ + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft.AspNetCore": "Warning" + } + }, + "AllowedHosts": "*", + "Quartz": { + "JobSchedule": "0 0/30 * * * ?" + }, + "TableScrapeUrl": "https://www.telebilbao.es/programacion/", + "StationProgramInformationUrl": "https://www.telebilbao.es/" +}