Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extends RegexParse plugin #1096

Merged
merged 6 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 108 additions & 42 deletions lib/LANraragi/Plugin/Metadata/RegexParse.pm
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,25 @@ use warnings;
#Plugins can freely use all Perl packages already installed on the system.
#However, try to restrict yourself to the ones already installed for LRR (see tools/cpanfile) to avoid extra installations by the end-user.
use File::Basename;
use Scalar::Util qw(looks_like_number);

#You can also use the LRR Internal API where appropriate.
use LANraragi::Model::Plugins;
use LANraragi::Utils::Database qw(redis_encode redis_decode);
use LANraragi::Utils::Logging qw(get_plugin_logger);
use LANraragi::Utils::String qw(trim);
use LANraragi::Utils::Logging qw(get_plugin_logger);
use LANraragi::Utils::String qw(trim);
use Scalar::Util qw(looks_like_number);

# Namespace prefix attached to captures that are not accepted as a regular
# event/artist/series/language value.
my $PLUGIN_TAG_NS = 'parsed:';

# Lookup set (every value is 1) of lower-case words that are moved to the
# plugin namespace instead of being kept under the namespace they were found in.
my %COMMON_EXTRANEOUS_VALUES = map { $_ => 1 } qw(
    uncensored
    decensored
    ongoing
    pixiv
    twitter
    fanbox
    cosplay
    digital
);

#Meta-information about your plugin.
sub plugin_info {
Expand All @@ -23,34 +35,58 @@ sub plugin_info {
type => "metadata",
namespace => "regexplugin",
author => "Difegue",
version => "1.0",
description =>
"Derive tags from the filename of the given archive. <br>Follows the doujinshi naming standard (Release) [Artist] TITLE (Series) [Language].",
version => "1.2",
description => "Derive tags from the filename of the given archive.<br><br>"
. "By default it follows the doujinshi naming standard \"(Release) [Artist] TITLE (Series) [Language]\".<br><br>"
. "Instead, by activating the plugin settings below, you can extend the capture to the content of each bracket in"
. " the filename, even if it does not belong to the standard naming format.<br>"
. "Non-standard tags will be made available to you associated with the \"<i>${PLUGIN_TAG_NS}</i>\" namespace so"
. " you can manage them as you please by creating your own set of Tag Rules.<br>"
. "My only suggestion is that you should place the rule \"<i>-${PLUGIN_TAG_NS}*</i>\" as your last rule to cleanup all the unnecessary elements.",
icon =>
"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAYAAACNiR0NAAAAAXNSR0IArs4c6QAAAL1JREFUOI1jZMABpNbH/sclx8DAwPAscDEjNnEMQUIGETIYhUOqYdgMhTPINQzdUEZqGIZsKBM1DEIGTOiuexqwCKdidDl0vtT62P9kuZCJEWuKYWBgYGBgRHbh04BFDNIb4jAUbbSrZTARUkURg6lD10OUC/0PNaMYgs1Skgwk1jCSDCQWoBg46dYmhite0+D8pwGLCMY6uotRDOy8toZBkI2HIhcO/pxCm8KBUkOxFl/kGoq3gCXFYFxVAACeoU/8xSNybwAAAABJRU5ErkJggg==",
parameters => [ ]
parameters => [
{ type => "bool",
desc =>
"If the filename ends with a pair of curly braces, return the contents inside them as a list of simple tags, without the \"${PLUGIN_TAG_NS}\" namespace"
},
{ type => "bool",
desc =>
"Capture everything you find between a pair of parentheses and make it available under the \"${PLUGIN_TAG_NS}\" namespace<BR />"
. "(use this in conjunction with Tag Rules)"
}
],
);

}

#Mandatory function to be implemented by your plugin
# Derives tags and a title from the filename of the archive being processed.
#
# Positional arguments:
#   (first)              - discarded
#   $lrr_info            - global info hash; only its 'file_path' entry is read here
#   $check_trailing_tags - plugin setting, forwarded to parse_filename as 'check_trailing_tags'
#   $keep_all_captures   - plugin setting, forwarded to parse_filename as 'keep_all_captures'
#
# Returns the key/value list ( tags => <tag string>, title => <parsed title> ).
sub get_tags {
    my ( undef, $lrr_info, $check_trailing_tags, $keep_all_captures ) = @_;

    # lrr_info's file_path is taken straight from the filesystem, which might not be proper UTF-8.
    # Run a decode to make sure we can derive tags with the proper encoding.
    my $file = Mojo::File->new( redis_decode( $lrr_info->{'file_path'} ) );

    # Strip the directory part and the extension; only the bare name is parsed.
    my $filename = $file->basename( '.' . $file->extname );

    my ( $tags, $title ) = parse_filename(
        $filename,
        {   'check_trailing_tags' => $check_trailing_tags,
            'keep_all_captures'   => $keep_all_captures
        }
    );

    my $logger = get_plugin_logger();
    $logger->info("Sending the following tags to LRR: $tags");
    $logger->info("Parsed title is $title");

    return ( tags => $tags, title => $title );
}

# Get the filename from the file_path info field
my ( $filename, $filepath, $suffix ) = fileparse( $file, qr/\.[^.]*/ );
sub parse_filename {
my ( $filename, $params ) = @_;

my ( $event, $artist, $title, $series, $language );
$event = $artist = $title = $series = $language = "";
my ( $event, $artist, $title, $series, $language, $trailing_tags, $other_captures );

#Replace underscores with spaces
$filename =~ s/_/ /g;
Expand All @@ -64,42 +100,71 @@ sub get_tags {
if ( defined $5 ) { $title = trim($5); }
if ( defined $7 ) { $series = $7; }
if ( defined $9 ) { $language = $9; }
my $tail = trim( $+{'tail'} );

my @tags = ();

if ( $event ne "" ) {
push @tags, "event:$event";
}
if ($tail) {

if ( $artist ne "" ) {
# match trailing_tags (...{Tags}.ext)
if ( $params->{'check_trailing_tags'} ) {
$tail =~ /(?<head>.*)(\{(?<ttags>[^\}]*)\})$/;
$trailing_tags = $+{'ttags'};
$tail = $+{'head'} if ($trailing_tags);
}

#Special case for circle/artist sets:
#If the string contains parenthesis, what's inside those is the artist name
#the rest is the circle.
if ( $artist =~ /(.*) \((.*)\)/ ) {
push @tags, "group:$1";
push @tags, "artist:$2";
} else {
push @tags, "artist:$artist";
# match any remaining parenthesis
if ( $tail && $params->{'keep_all_captures'} ) {
my @items = ( $tail =~ /\(([^\)]+)\)|\{([^}]+)\}|\[([^\]]+)\]/g );
$other_captures = join( ',', grep { trim($_) } @items );
}
}

if ( $series ne "" ) {
push @tags, "series:$series";
my @tags;

push @tags, parse_artist_value($artist) if ($artist);
push @tags, "event:$event" if ($event);
push @tags, parse_captured_value_for_namespace( $language, 'language:' ) if ($language);
push @tags, parse_captured_value_for_namespace( $series, 'series:' ) if ($series);
push @tags, parse_captured_value_for_namespace( $other_captures, $PLUGIN_TAG_NS ) if ($other_captures);
push @tags, parse_captured_value_for_namespace( $trailing_tags, '' ) if ($trailing_tags);

if ( !$params->{'keep_all_captures'} ) {
@tags = grep { !m/^\Q$PLUGIN_TAG_NS/ } @tags;
}

# Don't push numbers as tags for language.
unless ( $language eq "" || looks_like_number($language) ) {
push @tags, "language:$language";
return ( join( ", ", sort @tags ), trim($title) );
}

# Converts the content of the [Artist] bracket into a list of tags.
#
#   $artist - raw text captured from the artist bracket
#
# Returns a list of tags: an optional "group:" tag followed by whatever
# parse_captured_value_for_namespace produces for the 'artist:' namespace.
sub parse_artist_value {
    my ($artist) = @_;

    my @tags;

    #Special case for circle/artist sets:
    #If the string contains parenthesis, what's inside those is the artist name
    #the rest is the circle.
    # The captures are copied at the match site so no later regex can clobber them.
    if ( my ( $circle, $name ) = $artist =~ /(.*) \((.*)\)/ ) {
        push @tags, "group:$circle";    # TODO: split comma-separated groups too?
        $artist = $name;
    }
    push @tags, parse_captured_value_for_namespace( $artist, 'artist:' );

    return @tags;
}

$logger->info("Sending the following tags to LRR: $tagstring");
# Splits a comma-separated capture into individual tags.
#
#   $capture   - text captured from a bracket, possibly holding several comma-separated values
#   $namespace - namespace prefix to apply (e.g. 'series:'); an empty string produces simple tags
#
# Each value is trimmed and handed to _classify_item. Values that are empty
# after trimming (as in "En,,Textless") are dropped so that no value-less tag
# such as "language:" is emitted. Returns the list of resulting tags.
sub parse_captured_value_for_namespace {
    my ( $capture, $namespace ) = @_;

    my @items = grep { length $_ } map { trim($_) } split( m/,/, $capture );

    return map { _classify_item( $_, $namespace ) } @items;
}

$logger->info("Parsed title is $title");
return ( tags => $tagstring, title => $title );
# Decides which namespace a single, already trimmed value ends up in.
#
#   $item      - the value to classify
#   $namespace - the namespace it was found under ('' for simple tags)
#
# Returns the value prefixed with the plugin namespace when it is considered
# extraneous, otherwise prefixed with $namespace.
sub _classify_item {
    my ( $item, $namespace ) = @_;

    # The common-word exclusion only applies when a namespace was specified;
    # without one we are dealing with simple tags.
    my $is_common_word = $namespace && $COMMON_EXTRANEOUS_VALUES{ lc $item };

    # NOTE: the numeric check is applied whether or not a namespace was given.
    return $PLUGIN_TAG_NS . $item
        if $is_common_word || looks_like_number($item);

    return $namespace . $item;
}

#Regular Expression matching the E-Hentai standard: (Release) [Artist] TITLE (Series) [Language]
Expand All @@ -114,8 +179,9 @@ sub get_tags {
#([^([]+) returns the title. Mandatory.
#(\(([^([)]+)\))? returns the content of (Series). Optional.
#(\[([^]]+)\])? returns the content of [Language]. Optional.
#(?<tail>.*)? returns everything that follows the E-Hentai standard part of the name, for further processing. Optional.
#\s* indicates zero or more whitespaces.
# Compiled filename pattern. Numbered groups 2, 4, 5, 7 and 9 hold the release,
# artist, title, series and language; the named group "tail" holds whatever
# comes after them.
my $regex = qr/(\(([^([]+)\))?\s*(\[([^]]+)\])?\s*([^([]+)\s*(\(([^([)]+)\))?\s*(\[([^]]+)\])?(?<tail>.*)?/;

# Accessor for the compiled filename pattern.
sub get_regex { return $regex }

1;
139 changes: 132 additions & 7 deletions tests/LANraragi/Plugin/Metadata/RegexParse.t
Original file line number Diff line number Diff line change
@@ -1,26 +1,151 @@
use strict;
use warnings;
use utf8;

use Cwd qw( getcwd );

use Test::More;

# Load the shared mocks through an absolute path built from the current
# working directory (assumes the test is launched from the repository root).
my $cwd = getcwd();
require "$cwd/tests/mocks.pl";

use_ok('LANraragi::Plugin::Metadata::RegexParse');

my %PARAMS_EH_STANDARD = (
'check_trailing_tags' => 0,
'keep_all_captures' => 0,
);
my %PARAMS_KEEP_ALL = (
'check_trailing_tags' => 1,
'keep_all_captures' => 1,
);
my %SKIP_TRAILING_TAGS = ( 'check_trailing_tags' => 0 );

note("testing basic example");
{
    # Replace the plugin's logger with the mock for the duration of this block only.
    no warnings 'once', 'redefine';
    local *LANraragi::Plugin::Metadata::RegexParse::get_plugin_logger = sub { return get_logger_mock(); };

    # Both plugin settings enabled: trailing tags are checked and all captures are kept.
    my %response =
      LANraragi::Plugin::Metadata::RegexParse::get_tags( "",
        { file_path => "/poopoo/peepee/(NoNe) [Yanyanyo (Yanyo)] Reijo no Rei no... (Blue Archive) [English] [Digital].zip" },
        1, 1 );

    is( $response{title}, "Reijo no Rei no...", 'title' );
    is( $response{tags}, "artist:Yanyo, event:NoNe, group:Yanyanyo, language:English, parsed:Digital, series:Blue Archive",
        'tag list' );
}

my $filename = '(NoNe) [Yanyanyo (Yanyo)] Reijo no Rei no... (ongoing) [Decensored]';
note("parsing filename > $filename ...");
{
my ( $tags, $title ) =
LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, { %PARAMS_KEEP_ALL, ( 'keep_all_captures' => 0 ) } );
is( $tags, 'artist:Yanyo, event:NoNe, group:Yanyanyo', 'tag list' );
is( $title, 'Reijo no Rei no...', 'title' );

( $tags, $title ) = LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, \%PARAMS_KEEP_ALL );
is( $tags, 'artist:Yanyo, event:NoNe, group:Yanyanyo, parsed:Decensored, parsed:ongoing', 'full tag list' );
}

my %get_tags_params = ( file_path => "/poopoo/peepee/(Release) [Artist] TITLE (Series) [Language].arj" );
$filename = '(NoNe) [Yanyanyo (Yanyo)] Reijo no Rei no... (Blue Archive) [English] [Digital]';
note("parsing filename > $filename ...");
{
my ( $tags, $title ) = LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, \%PARAMS_KEEP_ALL );
is( $tags, 'artist:Yanyo, event:NoNe, group:Yanyanyo, language:English, parsed:Digital, series:Blue Archive', 'tag list' );
is( $title, 'Reijo no Rei no...', 'title' );
}

$filename = '(NoNe) [Yanyanyo (Yanyo)] Reijo no Rei no... (Blue Archive) [Decensored]';
note("parsing filename > $filename ...");
{
my ( $tags, $title ) = LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, \%PARAMS_KEEP_ALL );
is( $tags, 'artist:Yanyo, event:NoNe, group:Yanyanyo, parsed:Decensored, series:Blue Archive', 'tag list' );
is( $title, 'Reijo no Rei no...', 'title' );
}

$filename = '(NoNe) [Yanyanyo (Yanyo)] Reijo no Rei no... (Blue Archive) [Eng] [Uncensored]';
note("parsing filename > $filename ...");
{
my ( $tags, $title ) = LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, \%PARAMS_KEEP_ALL );
is( $tags, 'artist:Yanyo, event:NoNe, group:Yanyanyo, language:Eng, parsed:Uncensored, series:Blue Archive', 'tag list' );
is( $title, 'Reijo no Rei no...', 'title' );
}

$filename = '(NoNe) [Yanyo] Reijo no Rei no... (Blue Archive) [En]';
note("parsing filename > $filename ...");
{
my ( $tags, $title ) = LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, \%PARAMS_KEEP_ALL );
is( $tags, 'artist:Yanyo, event:NoNe, language:En, series:Blue Archive', 'tag list' );
is( $title, 'Reijo no Rei no...', 'title' );
}

# get_tags called without the optional plugin settings: only the standard
# naming fields are expected. The tag string is sorted by parse_filename, so
# the expectation lists the tags in sorted order.
my %response = LANraragi::Plugin::Metadata::RegexParse::get_tags( "", \%get_tags_params );
is( $response{title}, "TITLE", "Title was misparsed" );
is( $response{tags}, "artist:Artist, event:Release, language:Language, series:Series", "Wrong tags received" );
$filename = '[Yanyo] Reijo no Rei no... [english]';
note("parsing filename > $filename ...");
{
my ( $tags, $title ) = LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, \%PARAMS_KEEP_ALL );
is( $tags, 'artist:Yanyo, language:english', 'tag list' );
is( $title, 'Reijo no Rei no...', 'title' );
}

$filename = '[Yanyo] Reijo no Rei no... [english] {Team} Cap.01 (Digital) [Ongoing] [ ] () { } {big breasts, sole female}';
note("parsing filename > $filename ...");
{
my ( $tags, $title ) = LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, \%PARAMS_KEEP_ALL );
is( $tags,
'artist:Yanyo, big breasts, language:english, parsed:Digital, parsed:Ongoing, parsed:Team, sole female',
'tag list with all captures and the last curly brackets as simple tags'
);

( $tags, $title ) =
LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, { %PARAMS_KEEP_ALL, %SKIP_TRAILING_TAGS } );
is( $tags,
'artist:Yanyo, language:english, parsed:Digital, parsed:Ongoing, parsed:Team, parsed:big breasts, parsed:sole female',
'tag list with all captures'
);

( $tags, $title ) = LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, \%PARAMS_EH_STANDARD );
is( $tags, 'artist:Yanyo, language:english', 'tag list with only EH standard tags' );
}

$filename = '[黒ねずみいぬ, 市川和秀, 猪去バンセ, カサイこーめい, きしぐま, SUV, 重丸しげる, ちんぱん☆Mk-Ⅱ, ばんじゃく, 英, ふぁい, 水樹 凱, やさごり] So many artists!';
note("parsing filename > $filename ...");
{
my ( $tags, $title ) = LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, \%PARAMS_KEEP_ALL );
is( $tags,
'artist:きしぐま, artist:ちんぱん☆Mk-Ⅱ, artist:ばんじゃく, artist:ふぁい, artist:やさごり, artist:カサイこーめい, artist:市川和秀, artist:水樹 凱, artist:猪去バンセ, artist:英, artist:重丸しげる, artist:黒ねずみいぬ, artist:SUV',
'tag list'
);
is( $title, 'So many artists!', 'title' );
}

$filename = '(C24) [Atomic Diver Henshuubu (Tajima Shinobu, Yoko)] ART DANGER II (Various) [En,Textless]';
note("parsing filename > $filename ...");
{
my ( $tags, $title ) = LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, \%PARAMS_KEEP_ALL );
is( $tags,
'artist:Tajima Shinobu, artist:Yoko, event:C24, group:Atomic Diver Henshuubu, language:En, language:Textless, series:Various',
'tag list'
);
is( $title, 'ART DANGER II', 'title' );
}

$filename = '[Crimson Comics (Crimson)] J-Girl. Ecstasy (Black Cat, D.Gray-man, MX0, To Love-Ru) [English]';
note("parsing filename > $filename ...");
{
my ( $tags, $title ) = LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, \%PARAMS_KEEP_ALL );
is( $tags,
'artist:Crimson, group:Crimson Comics, language:English, series:Black Cat, series:D.Gray-man, series:MX0, series:To Love-Ru',
'tag list'
);
is( $title, 'J-Girl. Ecstasy', 'title' );
}

$filename = '[Pixiv] 佐々(66526024) 2024.10.19';
note("parsing filename > $filename ...");
{
my ( $tags, $title ) = LANraragi::Plugin::Metadata::RegexParse::parse_filename( $filename, \%PARAMS_KEEP_ALL );
is( $tags, 'parsed:66526024, parsed:Pixiv', 'tag list' );
is( $title, '佐々', 'not a title, but meh...' );
}

done_testing();