This commit is contained in:
567
TelebilbaoEpg/Jobs/ScrapeJob.cs
Normal file
567
TelebilbaoEpg/Jobs/ScrapeJob.cs
Normal file
@ -0,0 +1,567 @@
|
||||
using HtmlAgilityPack;
|
||||
using Quartz;
|
||||
using System.Globalization;
|
||||
using System.Linq;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Web;
|
||||
using TableSpans.HtmlAgilityPack;
|
||||
using TelebilbaoEpg.Database.Models;
|
||||
using TelebilbaoEpg.Database.Repository;
|
||||
|
||||
namespace TelebilbaoEpg.Jobs
|
||||
{
|
||||
public class ScrapeJob : IJob
|
||||
{
|
||||
// Application configuration; Execute reads the "TableScrapeUrl" and
// "StationProgramInformationUrl" values from it.
private readonly IConfiguration _configuration;

// Repository used by Execute to read the already stored broadcasts and to add new ones.
private readonly IBroadCastRepository _broadCastRepository;

/// <summary>
/// Creates the scrape job with the services it needs.
/// </summary>
/// <param name="configuration">Configuration that holds the URLs to scrape.</param>
/// <param name="broadCastRepository">Repository the parsed broadcasts are written to.</param>
public ScrapeJob(IConfiguration configuration, IBroadCastRepository broadCastRepository)
{
    _configuration = configuration;
    _broadCastRepository = broadCastRepository;
}
|
||||
|
||||
/// <summary>
/// Reads the first column of the programme table and converts every row that carries a
/// usable time into a <see cref="TimeBlock"/>.
/// </summary>
/// <param name="programTable">Table node whose <c>tbody/tr/td[1]</c> cells hold the slot times.</param>
/// <returns>
/// The accepted blocks in row order. <c>RowIndex</c> is the position of the cell in the
/// selected node list, <c>BlockIndex</c> the position among the accepted blocks. The list is
/// empty when the table is null or has no matching cells.
/// </returns>
private List<TimeBlock> GetTimeBlocks(HtmlNode programTable)
{
    var ret = new List<TimeBlock>();

    var timeCells = programTable?.SelectNodes("tbody/tr/td[1]");

    if (timeCells == null)
    {
        return ret;
    }

    var index = 0;
    var blockIndex = 0;

    foreach (var node in timeCells)
    {
        var currentBlock = new TimeBlock()
        {
            RowIndex = index,
            BlockIndex = blockIndex,
            From = TryParseTime(node.InnerText),
        };

        // The row counter advances for every cell, whether or not it yields a block.
        index++;

        if (!currentBlock.From.HasValue)
        {
            continue;
        }

        // A block is accepted when no earlier block starts later than it ...
        var shouldAdd = !ret.Any(b => b.From > currentBlock.From);

        if (!shouldAdd)
        {
            // ... or when it starts before the very first block of the table, i.e. the
            // times wrapped around and the row belongs to the tail end of the listing.
            var startDay = ret.First(b => b.BlockIndex == 0).From;

            shouldAdd = currentBlock.From < startDay;
        }

        if (!shouldAdd)
        {
            continue;
        }

        ret.Add(currentBlock);

        if (ret.Count > 1)
        {
            // Close the latest block that starts before the new one.
            // NOTE(review): only blocks with an earlier From are considered, so when the
            // times wrap (the new From is smaller than every earlier From) nothing is
            // closed here and the block directly before the wrap keeps To == null.
            var previousBlock = ret
                .OrderByDescending(b => b.RowIndex)
                .FirstOrDefault(b => b.From < currentBlock.From);

            if (previousBlock != null)
            {
                previousBlock.To = currentBlock.From.Value;
            }
        }

        blockIndex++;
    }

    if (ret.Count > 0)
    {
        // Blocks are appended in row order, so the first and last list entries are the
        // first and last rows. The last block ends where the first one starts.
        ret[ret.Count - 1].To = ret[0].From;
    }

    return ret;
}

/// <summary>
/// Converts the text of a time cell into a <see cref="TimeOnly"/> without throwing.
/// </summary>
/// <param name="text">Raw cell text, e.g. "21:30", "21::30" or "21.30".</param>
/// <returns>The parsed time, or <c>null</c> when the text is empty or not a valid time.</returns>
private static TimeOnly? TryParseTime(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    // A doubled colon is repaired before parsing.
    if (TimeOnly.TryParse(text.Replace("::", ":"), out var parsed))
    {
        return parsed;
    }

    // Fallback for "hour.minute" notation.
    var sections = text.Split('.');

    if (sections.Length == 2
        && int.TryParse(sections[0], out var hour)
        && int.TryParse(sections[1], out var minute)
        && hour >= 0 && hour <= 23
        && minute >= 0 && minute <= 59)
    {
        return new TimeOnly(hour, minute);
    }

    return null;
}
|
||||
|
||||
|
||||
/// <summary>
/// Scrapes the weekly programme table, enriches the parsed broadcasts with the description
/// and image found on the station's programme information page, and stores every broadcast
/// that the repository does not contain yet.
/// </summary>
/// <param name="context">Quartz execution context; not used by this job.</param>
/// <returns>A task that is complete once scraping and saving are done.</returns>
/// <remarks>
/// NOTE(review): the method is declared <c>async</c> but loads both pages with the blocking
/// <c>HtmlWeb.Load</c>; switching to an asynchronous load should be verified separately.
/// </remarks>
public async Task Execute(IJobExecutionContext context)
{
    var tableScrapeUrl = _configuration.GetValue<string>("TableScrapeUrl");
    HtmlWeb hw = new HtmlWeb();
    HtmlDocument doc = hw.Load(tableScrapeUrl);

    var parsedBroadCasts = ParseBroadCasts(doc);

    // Nothing could be parsed (no table, no usable rows): there is nothing to enrich or
    // save, and Min/Max below would throw on an empty list.
    if (parsedBroadCasts.Count == 0)
    {
        return;
    }

    var stationProgramInformationUrl = _configuration.GetValue<string>("StationProgramInformationUrl");
    doc = hw.Load(stationProgramInformationUrl);

    var parsedPrograms = ScrapeProgramItems(doc);

    // Index the programme details by name; the first entry for a name wins.
    var programsByName = new Dictionary<string, ProgramItem>();

    foreach (var program in parsedPrograms)
    {
        programsByName.TryAdd(program.Name, program);
    }

    foreach (var broadcast in parsedBroadCasts)
    {
        if (broadcast.Name != null && programsByName.TryGetValue(broadcast.Name, out var program))
        {
            broadcast.Description = program.Description;
            broadcast.ImageUrl = program.ImageUrl;
        }
    }

    SaveNewBroadCasts(parsedBroadCasts);
}

// Hour of day before which a listed start time is assigned to the following calendar day.
// NOTE(review): 7 is the value the original comparison hard-coded; it presumably marks the
// start of the broadcast day — confirm against the published schedule.
private const int NextDayCutoffHour = 7;

// Character that separates two programme names sharing one table cell.
private const string ProgramSeparator = "—";

// First table column that holds programmes (column 1 holds the times) and number of day columns.
private const int DayColumnStart = 2;
private const int DayColumnCount = 7;

// Finds the first clock time (e.g. "21:30" or "9.05") embedded in a cell's text.
private static readonly Regex EmbeddedTimeRegex = new Regex("(?:2[0-3]|[01]?[0-9])[:.][0-5]?[0-9]");

// Returns midnight of the Monday that starts the week containing the given moment.
private static DateTime GetStartOfWeek(DateTime now)
{
    // DayOfWeek counts from Sunday = 0, so shift it to Monday = 0 ... Sunday = 6.
    var daysSinceMonday = ((int)now.DayOfWeek + 6) % 7;

    return now.Date.AddDays(-daysSinceMonday);
}

// Moves the candidate one day forward when it lies before the reference (slot crossing midnight).
private static DateTime RollPastMidnight(DateTime notBefore, DateTime candidate)
{
    return candidate < notBefore ? candidate.AddDays(1) : candidate;
}

// Converts a match of EmbeddedTimeRegex into a time; the pattern guarantees valid digits and ranges.
private static TimeOnly ParseEmbeddedTime(string value)
{
    var parts = value.Split(':', '.');

    return new TimeOnly(
        int.Parse(parts[0], CultureInfo.InvariantCulture),
        int.Parse(parts[1], CultureInfo.InvariantCulture));
}

// Walks the seven day columns of the programme table and returns the merged broadcasts.
private List<BroadCast> ParseBroadCasts(HtmlDocument doc)
{
    var parsedBroadCasts = new List<BroadCast>();

    var sourceTable = doc.DocumentNode.SelectSingleNode("//table");

    if (sourceTable == null)
    {
        return parsedBroadCasts;
    }

    var programTable = new TableSpansExtension().ProcessTable(sourceTable);

    if (programTable == null)
    {
        return parsedBroadCasts;
    }

    var timeBlocks = GetTimeBlocks(programTable);

    // week starts at monday
    var startOfWeek = GetStartOfWeek(DateTime.Now);

    for (int dayIndex = DayColumnStart; dayIndex < DayColumnStart + DayColumnCount; dayIndex++)
    {
        var programBlocks = programTable.SelectNodes($"tbody/tr/td[{dayIndex}]");

        if (programBlocks == null)
        {
            continue;
        }

        var day = startOfWeek.AddDays(dayIndex - DayColumnStart);

        // The row counter restarts for every day column.
        var rowIndex = 0;

        foreach (var programBlock in programBlocks)
        {
            var rowSpan = GetRowSpan(doc, programBlock);

            foreach (var item in ParseCell(programBlock, day, rowIndex, rowSpan, timeBlocks))
            {
                MergeBroadCast(parsedBroadCasts, item);
            }

            rowIndex++;
        }
    }

    return parsedBroadCasts;
}

// Reads the rowspan attribute of the cell in the unprocessed document that has the same row/cell path; 0 when absent or invalid.
private static int GetRowSpan(HtmlDocument doc, HtmlNode programBlock)
{
    var rowPathIndex = programBlock.XPath.IndexOf("/tr", StringComparison.Ordinal);

    if (rowPathIndex < 0)
    {
        return 0;
    }

    var xpath = $"//table/tbody{programBlock.XPath.Substring(rowPathIndex)}";
    var originalNode = doc.DocumentNode.SelectSingleNode(xpath);

    if (originalNode == null || !originalNode.Attributes.Contains("rowspan"))
    {
        return 0;
    }

    // TryParse leaves the result at 0 when the attribute is not a number.
    int.TryParse(originalNode.Attributes["rowspan"].Value, out var rowSpan);

    return rowSpan;
}

// Returns the block for the given row, or else the latest block located before that row.
private static TimeBlock? FindEndBlock(List<TimeBlock> timeBlocks, int endRowIndex)
{
    return timeBlocks.FirstOrDefault(b => b.RowIndex == endRowIndex)
        ?? timeBlocks
            .OrderByDescending(b => b.RowIndex)
            .FirstOrDefault(b => endRowIndex > b.RowIndex);
}

// Turns one table cell into zero or more broadcasts, splitting the cell when it lists several programmes.
private List<BroadCast> ParseCell(HtmlNode programBlock, DateTime day, int rowIndex, int rowSpan, List<TimeBlock> timeBlocks)
{
    var broadCastsToAdd = new List<BroadCast>();

    var startBlock = timeBlocks.FirstOrDefault(b => b.RowIndex == rowIndex)
        ?? timeBlocks
            .OrderByDescending(b => b.RowIndex)
            .FirstOrDefault(b => b.RowIndex <= rowIndex + 1);

    if (startBlock == null || !startBlock.From.HasValue || !startBlock.To.HasValue)
    {
        return broadCastsToAdd;
    }

    var startTime = startBlock.From.Value;
    var endTime = startBlock.To.Value;

    // Slots that start in the early hours belong to the next calendar day. The day is
    // chosen from the start time only; an end time that falls before the start is moved
    // to the following day so that a broadcast never ends before it begins.
    var currentDay = startTime.Hour < NextDayCutoffHour ? day.AddDays(1) : day;

    var startDate = currentDay.AddTicks(startTime.Ticks);
    var endDate = RollPastMidnight(startDate, currentDay.AddTicks(endTime.Ticks));

    var text = HttpUtility.HtmlDecode(programBlock.InnerText);

    var timeMatch = EmbeddedTimeRegex.Match(text);

    if (timeMatch.Success)
    {
        // "First programme 21:30 Second programme": the embedded time is the hand-over.
        var firstProgramText = text.Substring(0, timeMatch.Index);
        var secondProgramText = text.Substring(timeMatch.Index + timeMatch.Length);

        var splitDate = currentDay.AddTicks(ParseEmbeddedTime(timeMatch.Value).Ticks);

        // A hand-over after midnight is moved to the next day, but only when that puts
        // it inside the slot.
        if (splitDate < startDate && splitDate.AddDays(1) <= endDate)
        {
            splitDate = splitDate.AddDays(1);
        }

        if (!string.IsNullOrEmpty(firstProgramText))
        {
            broadCastsToAdd.Add(new BroadCast()
            {
                From = startDate,
                To = splitDate,
                Name = SanitizeText(firstProgramText),
            });
        }

        if (!string.IsNullOrEmpty(secondProgramText))
        {
            broadCastsToAdd.Add(new BroadCast()
            {
                From = splitDate,
                To = endDate,
                Name = SanitizeText(secondProgramText),
            });
        }
    }
    else if (text.Contains(ProgramSeparator))
    {
        var separatorIndex = text.IndexOf(ProgramSeparator, StringComparison.Ordinal);

        var endBlock = FindEndBlock(timeBlocks, rowIndex + rowSpan);

        if (endBlock != null && endBlock.To.HasValue)
        {
            var blockEndDate = RollPastMidnight(startDate, currentDay.AddTicks(endBlock.To.Value.Ticks));
            var duration = blockEndDate - startDate;

            // Without a rowspan the cell is split in the middle. TotalMinutes is used
            // in both cases; the Minutes component alone is wrong for any slot of an
            // hour or longer.
            var splitDate = rowSpan > 0
                ? startDate.AddMinutes((int)duration.TotalMinutes / rowSpan)
                : startDate.AddMinutes((int)duration.TotalMinutes / 2);

            var firstProgramText = string.Empty;
            string secondProgramText;

            if (separatorIndex > 0)
            {
                firstProgramText = text.Substring(0, separatorIndex);

                // Skip the separator itself: SanitizeText cuts its input at the first
                // separator, so keeping it would always yield an empty second name.
                secondProgramText = text.Substring(separatorIndex + ProgramSeparator.Length);
            }
            else
            {
                secondProgramText = text.Replace(ProgramSeparator, "");
            }

            var firstProgramName = SanitizeText(firstProgramText);
            var secondProgramName = SanitizeText(secondProgramText);

            if (!string.IsNullOrEmpty(firstProgramName))
            {
                broadCastsToAdd.Add(new BroadCast()
                {
                    From = startDate,
                    To = splitDate,
                    Name = firstProgramName,
                });
            }

            if (!string.IsNullOrEmpty(secondProgramName) && splitDate <= endDate)
            {
                broadCastsToAdd.Add(new BroadCast()
                {
                    From = splitDate,
                    To = endDate,
                    Name = secondProgramName,
                });
            }
        }
    }
    else if (programBlock.SelectSingleNode("hr") != null)
    {
        // Programmes separated by a horizontal rule: every non-empty <strong> and <p>
        // child is one programme and all of them share the slot equally.
        var textNodes = new List<HtmlNode>();

        foreach (var childName in new[] { "strong", "p" })
        {
            var nodeCollection = programBlock.SelectNodes(childName);

            if (nodeCollection != null)
            {
                textNodes.AddRange(nodeCollection.Where(n => !string.IsNullOrEmpty(n.InnerText)));
            }
        }

        var nodeCount = textNodes.Count;

        if (nodeCount > 0)
        {
            var endBlock = FindEndBlock(timeBlocks, rowIndex + rowSpan);

            if (endBlock != null && endBlock.To.HasValue)
            {
                var blockEndDate = RollPastMidnight(startDate, currentDay.AddTicks(endBlock.To.Value.Ticks));
                var itemDuration = (blockEndDate - startDate).TotalMinutes / nodeCount;

                for (int i = 0; i < nodeCount; i++)
                {
                    var nodeStartDate = startDate.AddMinutes(i * itemDuration);

                    broadCastsToAdd.Add(new BroadCast()
                    {
                        From = nodeStartDate,
                        To = nodeStartDate.AddMinutes(itemDuration),
                        Name = SanitizeText(textNodes[i].InnerText),
                    });
                }
            }
        }
    }
    else
    {
        broadCastsToAdd.Add(new BroadCast()
        {
            From = startDate,
            To = endDate,
            Name = SanitizeText(text),
        });
    }

    return broadCastsToAdd;
}

// Adds the item, or extends an earlier broadcast of the same name that reaches up to the item's start.
private static void MergeBroadCast(List<BroadCast> parsedBroadCasts, BroadCast item)
{
    var sameProgram = parsedBroadCasts.FirstOrDefault(b => b.To >= item.From && b.Name.Equals(item.Name));
    var slotTaken = parsedBroadCasts.Any(b => b.From == item.From && b.To == item.To);

    if (sameProgram == null && !slotTaken)
    {
        parsedBroadCasts.Add(item);
    }
    else if (sameProgram != null)
    {
        sameProgram.To = item.To;
    }
}

// Collects name, description and image of every programme on the station's information page.
private List<ProgramItem> ScrapeProgramItems(HtmlDocument doc)
{
    var parsedPrograms = new List<ProgramItem>();

    var titleNodeCollection = doc.DocumentNode.SelectNodes("//h2[contains(@class, 'programa_title')]");

    if (titleNodeCollection == null)
    {
        return parsedPrograms;
    }

    foreach (var titleNode in titleNodeCollection)
    {
        // The element three levels above the title contains the image and description.
        var programWrapper = titleNode.ParentNode?.ParentNode?.ParentNode;

        if (programWrapper == null)
        {
            continue;
        }

        var title = SanitizeText(titleNode.InnerText);
        var description = string.Empty;
        var imageUrl = string.Empty;

        var imagenode = programWrapper
            .SelectSingleNode("div[contains(@class, 'wpb_single_image')]")
            ?.SelectSingleNode("figure/div/img");

        if (imagenode != null && imagenode.Attributes.Contains("src"))
        {
            imageUrl = imagenode.Attributes["src"].Value;
        }

        var descriptionNode = programWrapper.SelectSingleNode("div[contains(@class, 'vc_row-o-content-bottom')]");

        if (descriptionNode != null)
        {
            description = SanitizeText(descriptionNode.InnerText);
        }

        // Entries without a title or without a description are ignored.
        if (!string.IsNullOrEmpty(title) && !string.IsNullOrEmpty(description))
        {
            parsedPrograms.Add(new ProgramItem
            {
                Description = description,
                Name = title,
                ImageUrl = imageUrl,
            });
        }
    }

    return parsedPrograms;
}

// Stores every parsed broadcast that has no stored counterpart with the same start, end and name.
private void SaveNewBroadCasts(List<BroadCast> parsedBroadCasts)
{
    var startSaveDate = parsedBroadCasts.Min(x => x.From);
    var endSaveDate = parsedBroadCasts.Max(x => x.To);

    var savedBroadCasts = _broadCastRepository.GetBroadCasts(DateOnly.FromDateTime(startSaveDate), DateOnly.FromDateTime(endSaveDate));

    foreach (var broadcast in parsedBroadCasts)
    {
        var shouldSave = !savedBroadCasts.Any(b => b.From == broadcast.From && b.To == broadcast.To && b.Name == broadcast.Name);

        if (shouldSave)
        {
            _broadCastRepository.Add(broadcast);
        }
    }
}
|
||||
/// <summary>
/// Normalises scraped text into a display name: title-cases it with the current culture,
/// collapses every run of whitespace (line breaks, tabs, non-breaking spaces, repeated
/// spaces) into a single space and drops everything from the first "—" onwards.
/// </summary>
/// <param name="text">Raw text taken from the page; may be null or empty.</param>
/// <returns>The cleaned-up text, or an empty string when <paramref name="text"/> is null or empty.</returns>
private static string SanitizeText(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return string.Empty;
    }

    // Proper lower and upper case formatting. ToTitleCase leaves all-caps words alone,
    // hence the text is lower-cased first.
    var textInfo = CultureInfo.CurrentCulture.TextInfo;
    var ret = textInfo.ToTitleCase(textInfo.ToLower(text));

    ret = Regex.Replace(ret, @"\s+", " ").Trim();

    var separatorIndex = ret.IndexOf("—", StringComparison.Ordinal);

    if (separatorIndex > -1)
    {
        ret = ret.Substring(0, separatorIndex).Trim();
    }

    return ret;
}
|
||||
|
||||
/// <summary>
/// One time slot taken from the first column of the programme table.
/// </summary>
public class TimeBlock
{
    /// <summary>Position of the originating cell in the list of time cells.</summary>
    public int RowIndex { get; set; }

    /// <summary>Position of this block among the accepted blocks.</summary>
    public int BlockIndex { get; set; }

    /// <summary>Start of the slot; null when the cell held no usable time.</summary>
    public TimeOnly? From { get; set; }

    /// <summary>End of the slot; null until a following block supplies it.</summary>
    public TimeOnly? To { get; set; }
}
|
||||
|
||||
/// <summary>
/// Details of one programme as found on the station's programme information page.
/// All texts default to an empty string so that none of them is ever null.
/// </summary>
public class ProgramItem
{
    /// <summary>Sanitised programme title; broadcasts are matched against it by name.</summary>
    public string Name { get; set; } = string.Empty;

    /// <summary>Sanitised description text of the programme.</summary>
    public string Description { get; set; } = string.Empty;

    /// <summary>Value of the programme image's src attribute; empty when no image was found.</summary>
    public string ImageUrl { get; set; } = string.Empty;
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user