add movie scraping and documentation
Some checks failed
Build docker container / build (push) Failing after 7m21s

This commit is contained in:
David Claeys 2024-05-17 10:17:15 +02:00
parent a82d7bd861
commit 87e8700236
8 changed files with 203 additions and 14 deletions

View File

@ -17,6 +17,8 @@ RUN apk update \
&& apk upgrade --available \ && apk upgrade --available \
&& apk add ca-certificates \ && apk add ca-certificates \
&& apk add tzdata \ && apk add tzdata \
&& apk add envsubst \
&& apk add bash \
&& mkdir /config && mkdir -p /usr/local/share/ca-certificates/ && mkdir /config && mkdir -p /usr/local/share/ca-certificates/
COPY --from=build-env /App/out . COPY --from=build-env /App/out .
COPY --from=build-env /config /config COPY --from=build-env /config /config
@ -26,6 +28,10 @@ ENV ASPNETCORE_URLS=http://+:80;https://+:443
ENV ASPNETCORE_Kestrel__Certificates__Default__Path=/usr/local/share/ca-certificates/aspnetapp.crt ENV ASPNETCORE_Kestrel__Certificates__Default__Path=/usr/local/share/ca-certificates/aspnetapp.crt
ENV ASPNETCORE_Kestrel__Certificates__Default__KeyPath=/usr/local/share/ca-certificates/aspnetapp.key ENV ASPNETCORE_Kestrel__Certificates__Default__KeyPath=/usr/local/share/ca-certificates/aspnetapp.key
ENV ASPNETCORE_Kestrel__Certificates__Default__Password=$CERT_PASSWORD ENV ASPNETCORE_Kestrel__Certificates__Default__Password=$CERT_PASSWORD
ENV JOB_SCHEDULE="0 0/30 * * * ?"
ENV MOVIE_API_URL=https://api.themoviedb.org/
ENV MOVIE_API_KEY=""
ENV MOVIE_IMAGE_URL=https://image.tmdb.org
RUN chown -R app:app /App/* \ RUN chown -R app:app /App/* \
&& cp /config/aspnetapp.pem $ASPNETCORE_Kestrel__Certificates__Default__Path \ && cp /config/aspnetapp.pem $ASPNETCORE_Kestrel__Certificates__Default__Path \
&& cp /config/aspnetapp.key $ASPNETCORE_Kestrel__Certificates__Default__KeyPath \ && cp /config/aspnetapp.key $ASPNETCORE_Kestrel__Certificates__Default__KeyPath \
@ -41,8 +47,8 @@ RUN chown -R app:app /App/* \
&& mkdir /data && chmod 755 /data \ && mkdir /data && chmod 755 /data \
&& cat > /data/telebilbaoEpg.db \ && cat > /data/telebilbaoEpg.db \
&& chmod 777 /data/telebilbaoEpg.db \ && chmod 777 /data/telebilbaoEpg.db \
&& chown -R app:app /data/* && chown -R app:app /data/* \
ENTRYPOINT echo "$(envsubst '${MOVIE_API_URL},${MOVIE_API_KEY},${MOVIE_IMAGE_URL},${$JOB_SCHEDULE}' < appsettings.json)" > appsettings.json \
ENTRYPOINT ["dotnet", "TelebilbaoEpg.dll"] && dotnet "TelebilbaoEpg.dll"
EXPOSE 80 EXPOSE 80
EXPOSE 443 EXPOSE 443

View File

@ -22,10 +22,20 @@ Parameters :
- from : start date to get the schedule - from : start date to get the schedule
- to : end date to get the schedule - to : end date to get the schedule
## Pending ## Movie API
For movies, the page does not contain any metadata or poster.
In order to get this data [TMDB](https://developer.themoviedb.org/reference/intro/getting-started) is used.
In order to get your API key follow the steps on [this](https://developer.themoviedb.org/docs/getting-started) page.
## Docker image
### Environment Variables
| Variable | Description | Default |
|-------------------------------|------------------------------------------------------------------------|-------------------------------|
| JOB_SCHEDULE | Cron expression indicating the scraping recurrence | 0 0/30 * * * ? |
| MOVIE_API_URL | The url to the movie API | https://api.themoviedb.org/ |
| MOVIE_IMAGE_URL | The base url for images on the movie API | https://image.tmdb.org |
| MOVIE_API_KEY | The API key for the API | N/A |
Sometimes movies are emitted on this channel.
The titles are mentioned in Spanish together with the release year.
First the IMDB API was considered to accomplish this, but it might not work with Spanish retro titles.
Furthermore the IMDB requires an API token which would make the scraper more difficult to use.
An alternative that supports Spanish and without API tokens needs to be found.

View File

@ -7,6 +7,8 @@ using System.Web;
using TableSpans.HtmlAgilityPack; using TableSpans.HtmlAgilityPack;
using TelebilbaoEpg.Database.Models; using TelebilbaoEpg.Database.Models;
using TelebilbaoEpg.Database.Repository; using TelebilbaoEpg.Database.Repository;
using Telebilbap_Epg.Services;
using static System.Net.Mime.MediaTypeNames;
namespace TelebilbaoEpg.Jobs namespace TelebilbaoEpg.Jobs
{ {
@ -14,11 +16,13 @@ namespace TelebilbaoEpg.Jobs
{ {
private IConfiguration _configuration; private IConfiguration _configuration;
private IBroadCastRepository _broadCastRepository; private IBroadCastRepository _broadCastRepository;
private IMovieService _movieService;
public ScrapeJob(IConfiguration configuration, IBroadCastRepository broadCastRepository) public ScrapeJob(IConfiguration configuration, IBroadCastRepository broadCastRepository, IMovieService movieService)
{ {
_configuration = configuration; _configuration = configuration;
_broadCastRepository = broadCastRepository; _broadCastRepository = broadCastRepository;
_movieService = movieService;
} }
private List<TimeBlock> GetTimeBlocks(HtmlNode programTable) private List<TimeBlock> GetTimeBlocks(HtmlNode programTable)
@ -510,8 +514,45 @@ namespace TelebilbaoEpg.Jobs
var startSaveDate = parsedBroadCasts.Min(x => x.From); var startSaveDate = parsedBroadCasts.Min(x => x.From);
var endSaveDate = parsedBroadCasts.Max(x => x.To); var endSaveDate = parsedBroadCasts.Max(x => x.To);
var movieIndicator = "Cine.";
var movies = parsedBroadCasts.Where(b => b.Name.Contains(movieIndicator))
.ToList();
foreach(var movie in movies)
{
string yearPattern = "(\\d{4})";
var textWithoutIndicator = movie.Name.Replace(movieIndicator, string.Empty).Trim();
var match = Regex.Match(textWithoutIndicator, yearPattern);
int? year = null;
if (match.Success)
{
year = int.Parse(match.Value);
}
var title = textWithoutIndicator;
if (year.HasValue)
{
var yearIndex = textWithoutIndicator.IndexOf(year.Value.ToString());
title = textWithoutIndicator.Substring(0, yearIndex).Replace(".", "").Trim();
}
var foundMovie = await _movieService.GetMovie(title, year);
if (foundMovie != null)
{
movie.Name = foundMovie.Title;
movie.Description = foundMovie.Description;
movie.ImageUrl = foundMovie.ImageUrl;
}
}
var savedBroadCasts = _broadCastRepository.GetBroadCasts(DateOnly.FromDateTime(startSaveDate), DateOnly.FromDateTime(endSaveDate)); var savedBroadCasts = _broadCastRepository.GetBroadCasts(DateOnly.FromDateTime(startSaveDate), DateOnly.FromDateTime(endSaveDate));
foreach (var broadcast in parsedBroadCasts) foreach (var broadcast in parsedBroadCasts)
{ {
var shouldSave = !savedBroadCasts.Any(b => b.From == broadcast.From && b.To == broadcast.To && b.Name == broadcast.Name); var shouldSave = !savedBroadCasts.Any(b => b.From == broadcast.From && b.To == broadcast.To && b.Name == broadcast.Name);

View File

@ -1,6 +1,7 @@
using Quartz; using Quartz;
using TelebilbaoEpg.Database.Repository; using TelebilbaoEpg.Database.Repository;
using TelebilbaoEpg.Jobs; using TelebilbaoEpg.Jobs;
using Telebilbap_Epg.Services;
var builder = WebApplication.CreateBuilder(args); var builder = WebApplication.CreateBuilder(args);
@ -11,6 +12,8 @@ builder.Services.AddControllers();
builder.Services.AddEndpointsApiExplorer(); builder.Services.AddEndpointsApiExplorer();
builder.Services.AddSwaggerGen(); builder.Services.AddSwaggerGen();
builder.Services.AddHttpClient();
builder.Services.AddQuartz(); builder.Services.AddQuartz();
builder.Services.AddQuartzHostedService(opt => builder.Services.AddQuartzHostedService(opt =>
{ {
@ -20,6 +23,7 @@ builder.Services.AddQuartzHostedService(opt =>
builder.Logging.ClearProviders(); builder.Logging.ClearProviders();
builder.Logging.AddConsole(); builder.Logging.AddConsole();
builder.Services.AddScoped<IMovieService, MovieService>();
builder.Services.AddScoped<IBroadCastRepository, BroadCastRepository>(); builder.Services.AddScoped<IBroadCastRepository, BroadCastRepository>();
var app = builder.Build(); var app = builder.Build();

View File

@ -0,0 +1,7 @@
namespace Telebilbap_Epg.Services
{
/// <summary>
/// Abstraction over a movie metadata lookup, consumed by the scrape job to
/// enrich broadcasts that were recognised as movies.
/// </summary>
public interface IMovieService
{
/// <summary>
/// Searches for a movie by its title and, optionally, its release year.
/// </summary>
/// <param name="title">Title of the movie to search for.</param>
/// <param name="year">Release year used to narrow the search, or <c>null</c> when unknown.</param>
/// <returns>The matching <see cref="Movie"/>, or <c>null</c> when no match was found.</returns>
Task<Movie?> GetMovie(string title, int? year);
}
}

View File

@ -0,0 +1,111 @@
using System.Text.Json.Serialization;
namespace Telebilbap_Epg.Services
{
/// <summary>
/// <see cref="IMovieService"/> implementation that queries the movie API configured
/// under the <c>MovieApi</c> configuration section (<c>Url</c>, <c>ApiKey</c>, <c>ImageUrl</c>)
/// through its <c>/3/search/movie</c> endpoint.
/// </summary>
public class MovieService : IMovieService
{
    private readonly HttpClient _httpClient;
    private readonly IConfiguration _configuration;

    /// <summary>
    /// Creates the service.
    /// </summary>
    /// <param name="httpClient">HTTP client used to call the movie API.</param>
    /// <param name="configuration">Application configuration holding the <c>MovieApi</c> section.</param>
    public MovieService(HttpClient httpClient, IConfiguration configuration)
    {
        _httpClient = httpClient;
        _configuration = configuration;
    }

    /// <summary>
    /// Searches the movie API (Spanish language) for <paramref name="title"/>, optionally
    /// narrowed by <paramref name="year"/>.
    /// </summary>
    /// <param name="title">Title to search for.</param>
    /// <param name="year">Release year filter, or <c>null</c> to search across all years.</param>
    /// <returns>
    /// The matched movie, or <c>null</c> when the title is blank, the API answers with a
    /// non-success status, nothing is found, or several results are returned and none of
    /// them has exactly the requested title (ignoring case).
    /// </returns>
    public async Task<Movie?> GetMovie(string title, int? year)
    {
        // A blank query can never produce a meaningful match; skip the round trip.
        if (string.IsNullOrWhiteSpace(title))
        {
            return null;
        }

        // Base URLs may be configured with a trailing slash (the Docker default for the
        // API URL is); trim it so the composed URLs do not contain "//".
        var apiUrl = _configuration.GetValue<string>("MovieApi:Url")?.TrimEnd('/');
        var apiKey = _configuration.GetValue<string>("MovieApi:ApiKey");
        var imageUrl = _configuration.GetValue<string>("MovieApi:ImageUrl")?.TrimEnd('/');

        var queryString = System.Web.HttpUtility.ParseQueryString(string.Empty);
        queryString.Add("query", title);
        queryString.Add("language", "es");
        if (year.HasValue)
        {
            queryString.Add("year", year.Value.ToString(System.Globalization.CultureInfo.InvariantCulture));
        }
        queryString.Add("api_key", apiKey);

        var url = $"{apiUrl}/3/search/movie?{queryString}";

        using var requestResponse = await _httpClient.GetAsync(url);
        if (!requestResponse.IsSuccessStatusCode)
        {
            // Error responses are not guaranteed to carry a JSON search payload;
            // treat them as "no movie found" instead of failing while deserializing.
            return null;
        }

        var results = await requestResponse.Content.ReadFromJsonAsync<ApiResults>();
        if (results is null || results.TotalResults <= 0 || results.Results is null)
        {
            return null;
        }

        // With a single hit, trust it. With several hits, only accept an exact
        // (case-insensitive) title match to avoid attaching the wrong metadata.
        var candidates = results.Results;
        var bestMatch = candidates.Count > 1
            ? candidates.FirstOrDefault(r => string.Equals(r.Title, title, StringComparison.OrdinalIgnoreCase))
            : candidates.FirstOrDefault();
        if (bestMatch is null)
        {
            return null;
        }

        // The release date may be empty or missing; parse it culture-independently
        // and leave it unset when it is not a valid date.
        DateOnly? releaseDate = null;
        if (DateOnly.TryParse(
                bestMatch.ReleaseDate,
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None,
                out var parsedDate))
        {
            releaseDate = parsedDate;
        }

        var posterPath = string.Empty;
        if (!string.IsNullOrEmpty(bestMatch.PosterPath))
        {
            posterPath = $"{imageUrl}/t/p/original{bestMatch.PosterPath}";
        }

        return new Movie()
        {
            ImageUrl = posterPath,
            Title = bestMatch.Title,
            ReleaseDate = releaseDate,
            Description = bestMatch.Overview,
        };
    }
}
/// <summary>
/// Envelope of a movie search response: the total number of matches and the
/// list of results contained in the response.
/// </summary>
internal class ApiResults
{
    /// <summary>Total number of matches reported by the API (<c>total_results</c>).</summary>
    [JsonPropertyName("total_results")]
    public int TotalResults { get; set; }

    /// <summary>
    /// Search hits contained in the response (<c>results</c>). The JSON name is pinned
    /// explicitly so binding does not depend on case-insensitive serializer options.
    /// </summary>
    [JsonPropertyName("results")]
    public List<ApiResult> Results { get; set; } = new List<ApiResult>();
}
/// <summary>
/// A single hit of a movie search response. Only the fields consumed by
/// the movie service are mapped.
/// </summary>
internal class ApiResult
{
    /// <summary>Movie title (<c>title</c>).</summary>
    [JsonPropertyName("title")]
    public string Title { get; set; } = string.Empty;

    /// <summary>Plot summary (<c>overview</c>).</summary>
    [JsonPropertyName("overview")]
    public string Overview { get; set; } = string.Empty;

    /// <summary>
    /// Relative path of the poster image (<c>poster_path</c>); <c>null</c> when the
    /// movie has no poster, so callers must check before building an image URL.
    /// </summary>
    [JsonPropertyName("poster_path")]
    public string? PosterPath { get; set; }

    /// <summary>Release date as delivered by the API (<c>release_date</c>); may be empty.</summary>
    [JsonPropertyName("release_date")]
    public string ReleaseDate { get; set; } = string.Empty;
}
/// <summary>
/// Movie metadata handed back to callers of the movie service.
/// </summary>
public class Movie
{
    /// <summary>Title of the movie; empty until assigned.</summary>
    public string Title { get; set; } = "";

    /// <summary>Description (plot summary) of the movie; empty until assigned.</summary>
    public string Description { get; set; } = "";

    /// <summary>Release date, or <c>null</c> when it is not known.</summary>
    public DateOnly? ReleaseDate { get; set; }

    /// <summary>URL of the poster image; empty until assigned.</summary>
    public string ImageUrl { get; set; } = "";
}
}

View File

@ -5,9 +5,14 @@
"Microsoft.AspNetCore": "Warning" "Microsoft.AspNetCore": "Warning"
}, },
"Quartz": { "Quartz": {
"JobSchedule": "0 0/1 * * * ?" "JobSchedule": "$JOB_SCHEDULE"
}, },
"TableScrapeUrl": "https://www.telebilbao.es/programacion/", "TableScrapeUrl": "https://www.telebilbao.es/programacion/",
"StationProgramInformationUrl": "https://www.telebilbao.es/" "StationProgramInformationUrl": "https://www.telebilbao.es/",
"MovieApi": {
"Url": "$MOVIE_API_URL",
"ApiKey": "$MOVIE_API_KEY",
"ImageUrl": "$MOVIE_IMAGE_URL"
}
} }
} }

View File

@ -7,8 +7,13 @@
}, },
"AllowedHosts": "*", "AllowedHosts": "*",
"Quartz": { "Quartz": {
"JobSchedule": "0 0/30 * * * ?" "JobSchedule": "$JOB_SCHEDULE"
}, },
"TableScrapeUrl": "https://www.telebilbao.es/programacion/", "TableScrapeUrl": "https://www.telebilbao.es/programacion/",
"StationProgramInformationUrl": "https://www.telebilbao.es/" "StationProgramInformationUrl": "https://www.telebilbao.es/",
"MovieApi": {
"Url": "$MOVIE_API_URL",
"ApiKey": "$MOVIE_API_KEY",
"ImageUrl": "$MOVIE_IMAGE_URL"
}
} }