using HtmlAgilityPack;
using Quartz;
using System.Globalization;
using System.Linq;
using System.Text.RegularExpressions;
using System.Web;
using TableSpans.HtmlAgilityPack;
using TelebilbaoEpg.Database.Models;
using TelebilbaoEpg.Database.Repository;
using Telebilbap_Epg.Services;
using static System.Net.Mime.MediaTypeNames;

namespace TelebilbaoEpg.Jobs
{
    /// <summary>
    /// Quartz job that scrapes the weekly program table from the configured
    /// <c>TableScrapeUrl</c>, enriches the parsed broadcasts with program details from
    /// <c>StationProgramInformationUrl</c> and with movie data from <see cref="IMovieService"/>,
    /// and stores every broadcast that is not already in the repository.
    /// </summary>
    public class ScrapeJob : IJob
    {
        /// <summary>Matches an inline time such as "21:30" or "9.05" inside a program cell.</summary>
        private const string TimePattern = "(?:2[0-3]|[01]?[0-9])[:.][0-5]?[0-9]";

        /// <summary>Matches the first run of four digits, used as the release year of a movie.</summary>
        private const string YearPattern = "(\\d{4})";

        /// <summary>Character that separates two programs sharing one table cell.</summary>
        private const string Separator = "—";

        /// <summary>Marker contained in the name of broadcasts that are movies.</summary>
        private const string MovieIndicator = "Cine.";

        /// <summary>1-based <c>td</c> index of the first day column (column 1 holds the times).</summary>
        private const int DayColumnStart = 2;

        /// <summary>Number of day columns read from the table.</summary>
        private const int DaysPerWeek = 7;

        /// <summary>Blocks that start or end before this hour are dated on the following calendar day.</summary>
        private const int NextDayCutoffHour = 7;

        private readonly IConfiguration _configuration;
        private readonly IBroadCastRepository _broadCastRepository;
        private readonly IMovieService _movieService;

        public ScrapeJob(IConfiguration configuration, IBroadCastRepository broadCastRepository, IMovieService movieService)
        {
            _configuration = configuration;
            _broadCastRepository = broadCastRepository;
            _movieService = movieService;
        }

        /// <summary>
        /// Reads the first column of the program table and turns every parsable time into a
        /// <see cref="TimeBlock"/> whose <c>To</c> is the start of the following block.
        /// </summary>
        /// <param name="programTable">Table node whose <c>tbody/tr/td[1]</c> cells hold the times.</param>
        /// <returns>The accepted blocks in row order; empty when the table has no time cells.</returns>
        private List<TimeBlock> GetTimeBlocks(HtmlNode programTable)
        {
            var ret = new List<TimeBlock>();
            var timeCells = programTable.SelectNodes("tbody/tr/td[1]");
            if (timeCells == null)
            {
                return ret;
            }

            var blockIndex = 0;
            for (var index = 0; index < timeCells.Count; index++)
            {
                if (!TryParseTime(timeCells[index].InnerText, out var from))
                {
                    // Rows without a readable time do not open a new block.
                    continue;
                }

                // A time is accepted when nothing later has been seen yet, or when it lies before
                // the first block of the day (the schedule has wrapped past midnight).
                var isLatestSoFar = !ret.Any(b => b.From > from);
                if (!isLatestSoFar && !(from < ret.First(b => b.BlockIndex == 0).From))
                {
                    continue;
                }

                // Close the most recent block that starts earlier than this one.
                // NOTE(review): for the first block after midnight no earlier block exists, so the
                // last block before midnight keeps To == null - confirm this is intended.
                var previousBlock = ret.LastOrDefault(b => b.From < from);
                if (previousBlock != null)
                {
                    previousBlock.To = from;
                }

                ret.Add(new TimeBlock
                {
                    RowIndex = index,
                    BlockIndex = blockIndex,
                    From = from,
                });
                blockIndex++;
            }

            if (ret.Count > 0)
            {
                // The last block of the table runs until the first block starts again.
                ret[^1].To = ret[0].From;
            }

            return ret;
        }

        /// <summary>
        /// Parses a time written either in a format <see cref="TimeOnly.TryParse(string, out TimeOnly)"/>
        /// understands (after replacing "::" with ":") or as "hour.minute".
        /// </summary>
        /// <returns><c>true</c> when <paramref name="time"/> holds a valid time; otherwise <c>false</c>.</returns>
        private static bool TryParseTime(string text, out TimeOnly time)
        {
            time = default;
            if (string.IsNullOrEmpty(text))
            {
                return false;
            }

            if (TimeOnly.TryParse(text.Replace("::", ":"), out time))
            {
                return true;
            }

            var sections = text.Split('.');
            if (sections.Length == 2
                && int.TryParse(sections[0], out var hour)
                && int.TryParse(sections[1], out var minute)
                && hour >= 0 && hour <= 23
                && minute >= 0 && minute <= 59)
            {
                time = new TimeOnly(hour, minute);
                return true;
            }

            time = default;
            return false;
        }

        /// <summary>Returns midnight of the Monday of the week that contains <paramref name="now"/>.</summary>
        private static DateTime GetStartOfWeek(DateTime now)
        {
            // DayOfWeek.Sunday is 0, so shift by six to count the days elapsed since Monday.
            var daysSinceMonday = ((int)now.DayOfWeek + 6) % 7;
            return now.Date.AddDays(-daysSinceMonday);
        }

        /// <summary>
        /// Scrapes the program table and the program information page, merges both with movie
        /// data and saves the broadcasts that are not stored yet.
        /// </summary>
        /// <param name="context">Quartz execution context; not read by this job.</param>
        public async Task Execute(IJobExecutionContext context)
        {
            var tableScrapeUrl = _configuration.GetValue<string>("TableScrapeUrl");
            var web = new HtmlWeb();
            var scheduleDocument = web.Load(tableScrapeUrl);

            var tableSpanExtension = new TableSpansExtension();
            var programTable = tableSpanExtension.ProcessTable(scheduleDocument.DocumentNode.SelectSingleNode("//table"));
            var timeBlocks = GetTimeBlocks(programTable);

            // week starts at monday
            var startOfWeek = GetStartOfWeek(DateTime.Now);
            var parsedBroadCasts = ParseBroadCasts(scheduleDocument, programTable, timeBlocks, startOfWeek);
            if (parsedBroadCasts.Count == 0)
            {
                // Nothing could be parsed, so there is no date range to compare and nothing to save.
                return;
            }

            var stationProgramInformationUrl = _configuration.GetValue<string>("StationProgramInformationUrl");
            var parsedPrograms = ParsePrograms(web.Load(stationProgramInformationUrl));
            foreach (var broadcast in parsedBroadCasts)
            {
                var program = parsedPrograms.FirstOrDefault(p => p.Name == broadcast.Name);
                if (program != null)
                {
                    broadcast.Description = program.Description;
                    broadcast.ImageUrl = program.ImageUrl;
                }
            }

            await ApplyMovieInformationAsync(parsedBroadCasts);
            SaveNewBroadCasts(parsedBroadCasts);
        }

        /// <summary>Walks the seven day columns of the table and collects the broadcasts of every cell.</summary>
        private List<BroadCast> ParseBroadCasts(HtmlDocument doc, HtmlNode programTable, List<TimeBlock> timeBlocks, DateTime startOfWeek)
        {
            var parsedBroadCasts = new List<BroadCast>();
            for (var dayIndex = DayColumnStart; dayIndex < DayColumnStart + DaysPerWeek; dayIndex++)
            {
                var programBlocks = programTable.SelectNodes($"tbody/tr/td[{dayIndex}]");
                if (programBlocks == null)
                {
                    continue;
                }

                var day = startOfWeek.AddDays(dayIndex - DayColumnStart);
                var rowIndex = 0;
                foreach (var programBlock in programBlocks)
                {
                    var broadCastsToAdd = ParseProgramBlock(doc, programBlock, timeBlocks, day, rowIndex);
                    MergeBroadCasts(parsedBroadCasts, broadCastsToAdd);
                    rowIndex++;
                }
            }

            return parsedBroadCasts;
        }

        /// <summary>
        /// Converts one table cell into its broadcasts. A cell yields several broadcasts when it
        /// contains an inline time, the separator character or an <c>hr</c> element.
        /// </summary>
        /// <returns>The broadcasts of the cell; empty when no time block with start and end covers the row.</returns>
        private List<BroadCast> ParseProgramBlock(HtmlDocument doc, HtmlNode programBlock, List<TimeBlock> timeBlocks, DateTime day, int rowIndex)
        {
            var broadCasts = new List<BroadCast>();

            // NOTE(review): the fallback also accepts the block of the row directly below
            // (rowIndex + 1) - confirm this is intended.
            var startBlock = timeBlocks.FirstOrDefault(b => b.RowIndex == rowIndex)
                ?? timeBlocks.OrderByDescending(b => b.RowIndex).FirstOrDefault(b => b.RowIndex <= rowIndex + 1);
            if (startBlock == null || !startBlock.From.HasValue || !startBlock.To.HasValue)
            {
                return broadCasts;
            }

            var startTime = startBlock.From.Value;
            var endTime = startBlock.To.Value;
            var currentDay = day;
            if (startTime.Hour < NextDayCutoffHour || endTime.Hour < NextDayCutoffHour)
            {
                currentDay = currentDay.AddDays(1);
            }

            var startDate = currentDay.AddTicks(startTime.Ticks);
            var endDate = currentDay.AddTicks(endTime.Ticks);
            var text = HttpUtility.HtmlDecode(programBlock.InnerText);

            var timeMatch = Regex.Match(text, TimePattern);
            if (timeMatch.Success && TryParseTime(timeMatch.Value, out var splitTime))
            {
                // "Program A 21:30 Program B": the inline time is where the second program starts.
                var splitDate = currentDay.AddTicks(splitTime.Ticks);
                AddIfNamed(broadCasts, startDate, splitDate, SanitizeText(text.Substring(0, timeMatch.Index)));
                AddIfNamed(broadCasts, splitDate, endDate, SanitizeText(text.Substring(timeMatch.Index + timeMatch.Length)));
            }
            else if (text.Contains(Separator))
            {
                SplitBySeparator(broadCasts, text, timeBlocks, rowIndex, GetRowSpan(doc, programBlock), currentDay, startDate, endDate);
            }
            else if (programBlock.SelectSingleNode("hr") != null)
            {
                SplitByHorizontalRule(broadCasts, programBlock, timeBlocks, rowIndex, GetRowSpan(doc, programBlock), currentDay, startDate);
            }
            else
            {
                // NOTE(review): an empty cell produces a broadcast with an empty name here, as before.
                broadCasts.Add(new BroadCast
                {
                    From = startDate,
                    To = endDate,
                    Name = SanitizeText(text),
                });
            }

            return broadCasts;
        }

        /// <summary>
        /// Reads the <c>rowspan</c> attribute of the cell in the originally loaded document that
        /// sits at the same row/cell path as <paramref name="programBlock"/>.
        /// </summary>
        /// <returns>The parsed row span, or 0 when the cell or a numeric attribute cannot be found.</returns>
        private static int GetRowSpan(HtmlDocument doc, HtmlNode programBlock)
        {
            var blockPath = programBlock.XPath;
            var rowPathIndex = blockPath.IndexOf("/tr", StringComparison.Ordinal);
            if (rowPathIndex < 0)
            {
                return 0;
            }

            var originalNode = doc.DocumentNode.SelectSingleNode($"//table/tbody{blockPath.Substring(rowPathIndex)}");
            if (originalNode == null || !originalNode.Attributes.Contains("rowspan"))
            {
                return 0;
            }

            return int.TryParse(originalNode.Attributes["rowspan"].Value, out var rowSpan) ? rowSpan : 0;
        }

        /// <summary>
        /// Finds the time block at row <c>beginIndex + rowSpan</c>, falling back to the closest
        /// block above that row.
        /// </summary>
        private static TimeBlock? FindEndBlock(List<TimeBlock> timeBlocks, int beginIndex, int rowSpan)
        {
            var endRow = beginIndex + rowSpan;
            return timeBlocks.FirstOrDefault(b => b.RowIndex == endRow)
                ?? timeBlocks.OrderByDescending(b => b.RowIndex).FirstOrDefault(b => endRow > b.RowIndex);
        }

        /// <summary>Adds a broadcast for the given range unless <paramref name="name"/> is empty.</summary>
        private static void AddIfNamed(List<BroadCast> broadCasts, DateTime from, DateTime to, string name)
        {
            if (string.IsNullOrEmpty(name))
            {
                return;
            }

            broadCasts.Add(new BroadCast
            {
                From = from,
                To = to,
                Name = name,
            });
        }

        /// <summary>Splits a cell of the form "Program A — Program B" into two broadcasts.</summary>
        private static void SplitBySeparator(List<BroadCast> broadCasts, string text, List<TimeBlock> timeBlocks, int beginIndex, int rowSpan, DateTime currentDay, DateTime startDate, DateTime endDate)
        {
            var endBlock = FindEndBlock(timeBlocks, beginIndex, rowSpan);
            if (endBlock == null || !endBlock.To.HasValue)
            {
                return;
            }

            var blockEndDate = currentDay.AddTicks(endBlock.To.Value.Ticks);
            var totalMinutes = (int)(blockEndDate - startDate).TotalMinutes;
            // Without a row span the cell is split in the middle; TotalMinutes (not Minutes) is
            // needed so that blocks of an hour or longer are halved correctly.
            var splitDate = startDate.AddMinutes(rowSpan > 0 ? totalMinutes / rowSpan : totalMinutes / 2);

            var firstProgramText = string.Empty;
            string secondProgramText;
            var separatorIndex = text.IndexOf(Separator, StringComparison.Ordinal);
            if (separatorIndex > 0)
            {
                firstProgramText = text.Substring(0, separatorIndex);
                // Skip the separator itself: SanitizeText cuts at the first separator it finds,
                // so keeping it would always blank the second name.
                secondProgramText = text.Substring(separatorIndex + Separator.Length);
            }
            else
            {
                secondProgramText = text.Replace(Separator, "");
            }

            AddIfNamed(broadCasts, startDate, splitDate, SanitizeText(firstProgramText));
            if (splitDate <= endDate)
            {
                AddIfNamed(broadCasts, splitDate, endDate, SanitizeText(secondProgramText));
            }
        }

        /// <summary>
        /// Splits a cell that contains an <c>hr</c> element into one broadcast per non-empty
        /// <c>strong</c> / <c>p</c> child, each receiving an equal share of the block duration.
        /// </summary>
        private static void SplitByHorizontalRule(List<BroadCast> broadCasts, HtmlNode programBlock, List<TimeBlock> timeBlocks, int beginIndex, int rowSpan, DateTime currentDay, DateTime startDate)
        {
            var textNodes = new List<HtmlNode>();
            foreach (var childName in new[] { "strong", "p" })
            {
                var nodeCollection = programBlock.SelectNodes(childName);
                if (nodeCollection != null)
                {
                    textNodes.AddRange(nodeCollection.Where(n => !string.IsNullOrEmpty(n.InnerText)));
                }
            }

            if (textNodes.Count == 0)
            {
                return;
            }

            var endBlock = FindEndBlock(timeBlocks, beginIndex, rowSpan);
            if (endBlock == null || !endBlock.To.HasValue)
            {
                return;
            }

            var blockEndDate = currentDay.AddTicks(endBlock.To.Value.Ticks);
            var itemDuration = (blockEndDate - startDate).TotalMinutes / textNodes.Count;
            for (var i = 0; i < textNodes.Count; i++)
            {
                var nodeStartDate = startDate.AddMinutes(i * itemDuration);
                broadCasts.Add(new BroadCast
                {
                    From = nodeStartDate,
                    To = nodeStartDate.AddMinutes(itemDuration),
                    // Decode entities like the other branches do, so names compare equal.
                    Name = SanitizeText(HttpUtility.HtmlDecode(textNodes[i].InnerText)),
                });
            }
        }

        /// <summary>
        /// Adds new broadcasts to <paramref name="parsedBroadCasts"/>. When a broadcast with the
        /// same name that ends at or after the new start already exists, its end is moved to the
        /// end of the new item instead; an item whose exact time range is already taken is dropped.
        /// </summary>
        private static void MergeBroadCasts(List<BroadCast> parsedBroadCasts, List<BroadCast> broadCastsToAdd)
        {
            foreach (var item in broadCastsToAdd)
            {
                var sameProgram = parsedBroadCasts.FirstOrDefault(b => b.To >= item.From && string.Equals(b.Name, item.Name, StringComparison.Ordinal));
                if (sameProgram != null)
                {
                    sameProgram.To = item.To;
                }
                else if (!parsedBroadCasts.Any(b => b.From == item.From && b.To == item.To))
                {
                    parsedBroadCasts.Add(item);
                }
            }
        }

        /// <summary>
        /// Reads title, description and image URL of every program listed on the program
        /// information page. Programs without a title or a description are skipped.
        /// </summary>
        private static List<ProgramItem> ParsePrograms(HtmlDocument doc)
        {
            var parsedPrograms = new List<ProgramItem>();
            var titleNodeCollection = doc.DocumentNode.SelectNodes("//h2[contains(@class, 'programa_title')]");
            if (titleNodeCollection == null)
            {
                return parsedPrograms;
            }

            foreach (var titleNode in titleNodeCollection)
            {
                // The image and description are looked up in the third ancestor of the title.
                var programWrapper = titleNode.ParentNode?.ParentNode?.ParentNode;
                if (programWrapper == null)
                {
                    continue;
                }

                var title = SanitizeText(HttpUtility.HtmlDecode(titleNode.InnerText));
                var description = string.Empty;
                var imageUrl = string.Empty;

                var imageNode = programWrapper
                    .SelectSingleNode("div[contains(@class, 'wpb_single_image')]")
                    ?.SelectSingleNode("figure/div/img");
                if (imageNode != null && imageNode.Attributes.Contains("src"))
                {
                    imageUrl = imageNode.Attributes["src"].Value;
                }

                var descriptionNode = programWrapper.SelectSingleNode("div[contains(@class, 'vc_row-o-content-bottom')]");
                if (descriptionNode != null)
                {
                    description = SanitizeText(HttpUtility.HtmlDecode(descriptionNode.InnerText));
                }

                if (!string.IsNullOrEmpty(title) && !string.IsNullOrEmpty(description))
                {
                    parsedPrograms.Add(new ProgramItem
                    {
                        Description = description,
                        Name = title,
                        ImageUrl = imageUrl,
                    });
                }
            }

            return parsedPrograms;
        }

        /// <summary>
        /// Looks up every broadcast whose name contains the movie marker in the movie service and,
        /// when found, replaces its name, description and image with the service data.
        /// </summary>
        private async Task ApplyMovieInformationAsync(List<BroadCast> parsedBroadCasts)
        {
            var movies = parsedBroadCasts.Where(b => b.Name.Contains(MovieIndicator)).ToList();
            foreach (var movie in movies)
            {
                var textWithoutIndicator = movie.Name.Replace(MovieIndicator, string.Empty).Trim();
                var title = textWithoutIndicator;
                int? year = null;

                var yearMatch = Regex.Match(textWithoutIndicator, YearPattern);
                if (yearMatch.Success && int.TryParse(yearMatch.Value, out var parsedYear))
                {
                    year = parsedYear;
                    // Everything in front of the year is the title; dots are dropped from it.
                    title = textWithoutIndicator.Substring(0, yearMatch.Index).Replace(".", "").Trim();
                }

                var foundMovie = await _movieService.GetMovie(title, year);
                if (foundMovie != null)
                {
                    movie.Name = foundMovie.Title;
                    movie.Description = foundMovie.Description;
                    movie.ImageUrl = foundMovie.ImageUrl;
                }
            }
        }

        /// <summary>
        /// Adds every broadcast to the repository that is not already stored with the same
        /// start, end and name. <paramref name="parsedBroadCasts"/> must not be empty.
        /// </summary>
        private void SaveNewBroadCasts(List<BroadCast> parsedBroadCasts)
        {
            var startSaveDate = parsedBroadCasts.Min(x => x.From);
            var endSaveDate = parsedBroadCasts.Max(x => x.To);
            var savedBroadCasts = _broadCastRepository.GetBroadCasts(DateOnly.FromDateTime(startSaveDate), DateOnly.FromDateTime(endSaveDate));
            foreach (var broadcast in parsedBroadCasts)
            {
                var shouldSave = !savedBroadCasts.Any(b => b.From == broadcast.From && b.To == broadcast.To && b.Name == broadcast.Name);
                if (shouldSave)
                {
                    _broadCastRepository.Add(broadcast);
                }
            }
        }

        /// <summary>
        /// Normalizes scraped text: title-cases it with the current culture, collapses every run
        /// of whitespace into one space, drops everything from the first separator character on
        /// and trims the result.
        /// </summary>
        /// <returns>The cleaned text, or an empty string for <c>null</c> or empty input.</returns>
        private static string SanitizeText(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return string.Empty;
            }

            // ToTitleCase leaves all-uppercase words untouched, so lower-case the text first.
            var culture = CultureInfo.CurrentCulture;
            var ret = culture.TextInfo.ToTitleCase(text.ToLower(culture));
            ret = Regex.Replace(ret, @"\s+", " ");

            var separatorIndex = ret.IndexOf(Separator, StringComparison.Ordinal);
            if (separatorIndex > -1)
            {
                ret = ret.Substring(0, separatorIndex);
            }

            return ret.Trim();
        }

        /// <summary>A time slot of the program table, identified by the table row it starts in.</summary>
        public class TimeBlock
        {
            /// <summary>Start of the slot.</summary>
            public TimeOnly? From { get; set; }

            /// <summary>End of the slot; <c>null</c> while no following slot has been assigned.</summary>
            public TimeOnly? To { get; set; }

            /// <summary>Zero-based index of the table row the slot was read from.</summary>
            public int RowIndex { get; set; }

            /// <summary>Zero-based position of the slot among the accepted slots.</summary>
            public int BlockIndex { get; set; }
        }

        /// <summary>Program details scraped from the program information page.</summary>
        public class ProgramItem
        {
            public string Name { get; set; } = string.Empty;

            public string Description { get; set; } = string.Empty;

            public string ImageUrl { get; set; } = string.Empty;
        }
    }
}