Skip to content

Commit

Permalink
Improved SmartTitleComparator and MovieUtils.parseTitle
Browse files Browse the repository at this point in the history
  • Loading branch information
REDNBLACK committed May 7, 2016
1 parent b41201c commit 9df9980
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 76 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@

import com.google.common.collect.ImmutableList;
import lombok.NonNull;
import lombok.val;
import org.apache.commons.lang3.StringUtils;
import org.f0w.k2i.core.comparator.AbstractMovieComparator;
import org.f0w.k2i.core.model.entity.Movie;
import org.f0w.k2i.core.util.string.NumericToWord;
import org.f0w.k2i.core.util.string.Translit;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.*;
import java.util.stream.Collectors;

import static com.google.common.base.CharMatcher.*;
import static org.apache.commons.lang3.StringEscapeUtils.unescapeXml;
import static org.apache.commons.lang3.StringUtils.*;

Expand All @@ -28,11 +28,9 @@ public final class SmartTitleComparator extends AbstractMovieComparator {
// Original string
list.add(s -> s);

// Original string without commas
list.add(s -> replaceChars(s, ",", ""));

// Original string without colon
list.add(s -> replaceChars(s, ":", ""));
// Original string without one of symbols
val symbolsToRemove = Arrays.asList(",", ":", "-", " ");
symbolsToRemove.forEach(symbol -> list.add(s -> replace(s, symbol, "")));

// Original string without apostrophes and quotes
list.add(s -> {
Expand All @@ -46,28 +44,27 @@ public final class SmartTitleComparator extends AbstractMovieComparator {
// Original string with unescaped XML symbols and removed foreign accents
list.add(s -> stripAccents(unescapeXml(s)));

// Original string trimmed, with each run of whitespace characters collapsed to a single plain space
list.add(StringUtils::normalizeSpace);

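// Original string without special symbols (NOTE(review): the CharMatcher variant ORs in isNot('.'), so in effect only '.' characters are removed; confirm intent)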
list.add(s -> s.replaceAll("/\\\\u([0-9a-z]{4})/", ""));
list.add(s -> javaLetterOrDigit().or(WHITESPACE).or(isNot('.')).precomputed().retainFrom(s));

// Original string with part before dash symbol
list.add(s -> Arrays.stream(splitByWholeSeparator(s, "-"))
// Original string with part before one of separating symbols
val separatingSymbols = Collections.singletonList("-");
separatingSymbols.forEach(separator -> list.add(s -> Arrays.stream(splitByWholeSeparator(s, separator))
.findFirst()
.map(String::trim)
.orElse(s)
);
));

// Original string with part after dash symbol
list.add(s -> Arrays.stream(splitByWholeSeparator(s, "-"))
.reduce((s1, s2) -> s2)
.map(String::trim)
.orElse(s)
);
// One of the prefixes + original string
val prefixes = Collections.singletonList("The ");
prefixes.forEach(p -> list.add(s -> p + s));

// The + Original string
list.add(s -> "The " + s);

// Original string with each run of whitespace characters replaced with a single plain space
list.add(s -> s.replaceAll("\\s+", " "));
// Original string + one of postfixes
val postfixes = Collections.singletonList("u");
postfixes.forEach(p -> list.add(s -> s + p));

// Original string with XML symbols replaced with backspace
list.add(s -> {
Expand All @@ -84,9 +81,6 @@ public final class SmartTitleComparator extends AbstractMovieComparator {
// Weakly transliterated string
list.add(Translit::toWeakerTranslit);

// Weakly transliterated with lower case and capitalized
list.add(s -> Translit.toWeakerTranslit(capitalize(s.toLowerCase())));

// Original string with numeric replaced to text representation
list.add(s -> Arrays.stream(splitByWholeSeparator(s, null))
.map(n -> {
Expand All @@ -100,12 +94,25 @@ public final class SmartTitleComparator extends AbstractMovieComparator {
);

// Modifiers using symbols mix
List<String> symbolsMix = Arrays.asList("&", "and", "et");
symbolsMix.forEach(s1 -> symbolsMix.forEach(s2 -> {
if (!s1.equals(s2)) {
list.add(s -> replace(s, s1, s2));
}
}));
val symbolsMix = Arrays.asList(
Arrays.asList("&", "and", "et"),
Arrays.asList("le", "the"),
Arrays.asList("#", "No.", "No. ", "№"),
Arrays.asList("contre", "vs.", "vs"),
Arrays.asList("ae", "e"),
Arrays.asList("Yi", "I"),
Arrays.asList("½", "1/2", " 1/2", "1/2 "),
Arrays.asList("¼", "1/4", " 1/4", "1/4 ")
);
symbolsMix.forEach(tuples1 -> {
tuples1.forEach(symbol1 -> {
tuples1.forEach(symbol2 -> {
if (!symbol1.equals(symbol2)) {
list.add(s -> replace(s, symbol1, symbol2));
}
});
});
});

modifiers = ImmutableList.copyOf(list);
}
Expand All @@ -115,26 +122,22 @@ public final class SmartTitleComparator extends AbstractMovieComparator {
*/
@Override
public boolean areEqual(@NonNull Movie movie1, @NonNull Movie movie2) {
val title1 = movie1.getTitle();
val title2 = movie2.getTitle();

for (StringModifier m1 : modifiers) {
for (StringModifier m2 : modifiers) {
String m1Title = m1.modify(movie1.getTitle());
String m2Title = m2.modify(movie2.getTitle());

boolean result = m1Title.equals(m2Title);

LOG.debug(
"Comparing title '{}' with title '{}', matches = '{}'",
m1Title,
m2Title,
result
);
boolean result = m1.modify(title1).equalsIgnoreCase(m2.modify(title2));

if (result) {
LOG.debug("Found match when comparing title '{}' with title '{}'", title1, title2);
return true;
}
}
}

LOG.debug("No matches found when comparing title '{}' with title '{}'!", title1, title2);

return false;
}

Expand Down
14 changes: 7 additions & 7 deletions core/src/main/java/org/f0w/k2i/core/util/MovieUtils.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package org.f0w.k2i.core.util;

import lombok.val;
import org.apache.commons.lang3.StringUtils;
import org.f0w.k2i.core.model.entity.Movie;
import org.f0w.k2i.core.util.exception.KinopoiskToIMDBException;
import org.jsoup.Jsoup;
Expand All @@ -16,6 +15,7 @@
import java.util.stream.Collectors;

import static org.f0w.k2i.core.util.exception.ExceptionUtils.uncheck;
import static org.apache.commons.lang3.StringUtils.replaceEachRepeatedly;

/**
* Null-safe utility class for checking and parsing movie fields.
Expand All @@ -31,11 +31,7 @@ private MovieUtils() {
* @return Parsed title
*/
public static String parseTitle(final String title) {
val resultTitle = StringUtils.replaceEachRepeatedly(
String.valueOf(title).trim(),
new String[]{"«", "»"},
new String[]{"", ""}
);
val resultTitle = String.valueOf(title).trim();

if ("".equals(resultTitle)) {
return "null";
Expand All @@ -55,7 +51,11 @@ public static String parseTitle(final String title, final String fallback) {
val resultTitle = parseTitle(title);

if ("null".equals(resultTitle)) {
return parseTitle(fallback);
return replaceEachRepeatedly(
parseTitle(fallback),
new String[]{"«", "»"},
new String[]{"", ""}
);
}

return resultTitle;
Expand Down
2 changes: 2 additions & 0 deletions core/src/main/java/org/f0w/k2i/core/util/string/Translit.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ public final class Translit {
weakerCharTable = Arrays.copyOf(charTable, charTable.length);
weakerCharTable['Ъ' - START_CHAR] = "";
weakerCharTable['Ь' - START_CHAR] = "";
weakerCharTable['Ё' - START_CHAR] = "Yo";

fillWithLowerCaseChars(weakerCharTable);
}
Expand Down Expand Up @@ -130,6 +131,7 @@ public static String toWeakerTranslit(String text) {

Map<String, Pair<String, String>> pairsToReplace = new ImmutableMap.Builder<String, Pair<String, String>>()
.put("ый", Pair.of("yi", "yy"))
.put("ищ", Pair.of("ish", "ishch"))
.build();

pairsToReplace.forEach((search, replacements) -> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,34 @@ public void areEqualEqualTitles() throws Exception {
new Movie("Inception", 2010),
new Movie("Inception", 2010)
));

assertTrue(comparator.areEqual(
new Movie("inception", 2010),
new Movie("Inception", 2010)
));

assertTrue(comparator.areEqual(
new Movie("Мы из будущего 2", 2010),
new Movie("Мы из будущего 2", 2010)
));
}

@Test
public void areEqualWithoutCommas() throws Exception {
public void areEqualWithRemovedSymbols() throws Exception {
assertTrue(comparator.areEqual(
new Movie("The boy, who lived", 2010),
new Movie("The boy who lived", 2010)
));
}

@Test
public void areEqualWithoutColon() throws Exception {
assertTrue(comparator.areEqual(
new Movie("Independence Day: Resurgence", 2016),
new Movie("Independence Day Resurgence", 2016)
));

assertTrue(comparator.areEqual(
new Movie("Lock Out", 2012),
new Movie("Lockout", 2012)
));
}

@Test
Expand All @@ -49,11 +61,6 @@ public void areEqualWithoutApostrophesAndQuotes() throws Exception {
new Movie("Операция Ы и другие приключения Шурика", 1965)
));

assertTrue(comparator.areEqual(
new Movie("Белый Бим Черное ухо", 2010),
new Movie("Belyy Bim Chernoe ukho", 2010)
));

assertTrue(comparator.areEqual(
new Movie("Childhood's 'End'", 2015),
new Movie("Childhoods End", 2015)
Expand All @@ -69,26 +76,26 @@ public void areEqualWithoutForeignAccents() {
}

@Test
public void areEqualWithPartBeforeDashSymbol() {
public void areEqualWithPartBeforeOneOfSeparatingSymbols() {
assertTrue(comparator.areEqual(
new Movie("Super Movie - Again", 2002),
new Movie("Super Movie", 2002)
));
}

@Test
public void areEqualWithPartAfterDashSymbol() {
public void areEqualWithThePrefixes() {
assertTrue(comparator.areEqual(
new Movie("Super Movie - Again - And again", 2002),
new Movie("And again", 2002)
new Movie("The Godfather", 1972),
new Movie("Godfather", 1972)
));
}

@Test
public void areEqualWithThePrefix() {
public void areEqualWithThePostfixes() {
assertTrue(comparator.areEqual(
new Movie("The Godfather", 1972),
new Movie("Godfather", 1972)
new Movie("Shaman kingu", 2001),
new Movie("Shaman king", 2001)
));
}

Expand All @@ -100,14 +107,6 @@ public void areEqualWithWhitespaceNormalized() {
));
}

@Test
public void areEqualWeaklyTransliteratedAndCapitalized() {
    // A mixed-case Cyrillic title must match its capitalized Latin transliteration.
    final Movie cyrillicMixedCase = new Movie("ЛУЧШИЙ фильм НА ВСЕМ свете", 2010);
    final Movie latinCapitalized = new Movie("Luchshii film na vsem svete", 2010);

    assertTrue(comparator.areEqual(cyrillicMixedCase, latinCapitalized));
}

@Test
public void areEqualWithNumericRepresentedAsWords() {
assertTrue(comparator.areEqual(
Expand All @@ -126,6 +125,32 @@ public void areEqualWithNumericRepresentedAsWords() {
));
}

@Test
public void areEqualTranslit() {
    // Each Cyrillic title must be considered equal to its Latin transliteration.
    // Pairs are built and checked one at a time, in the same order as before.
    final Movie islandCyrillic = new Movie("Остров сокровищ", 1988);
    final Movie islandLatin = new Movie("Ostrov sokrovishch", 1988);
    assertTrue(comparator.areEqual(islandCyrillic, islandLatin));

    final Movie bimCyrillic = new Movie("Белый Бим Черное ухо", 1976);
    final Movie bimLatin = new Movie("Belyy Bim Chernoe ukho", 1976);
    assertTrue(comparator.areEqual(bimCyrillic, bimLatin));

    final Movie holmesCyrillic = new Movie("Шерлок Холмс и доктор Ватсон: Знакомство", 1979);
    final Movie holmesLatin = new Movie("Sherlok Kholms i doktor Vatson: Znakomstvo", 1979);
    assertTrue(comparator.areEqual(holmesCyrillic, holmesLatin));
}

@Test
public void areEqualWithoutSpecialSymbols() {
    // A title with leading dots must match the same title without them.
    final Movie withLeadingDots = new Movie("...А зори здесь тихие", 1975);
    final Movie withoutDots = new Movie("А зори здесь тихие", 1975);

    assertTrue(comparator.areEqual(withLeadingDots, withoutDots));
}

@Test
public void areEqualWithReplacedSymbols() {
assertTrue(comparator.areEqual(
Expand All @@ -137,5 +162,25 @@ public void areEqualWithReplacedSymbols() {
new Movie("Mr. Peabody & Sherman", 2014),
new Movie("Mr. Peabody and Sherman", 2014)
));

assertTrue(comparator.areEqual(
new Movie("Легенда №17", 2013),
new Movie("Легенда No. 17", 2013)
));

assertTrue(comparator.areEqual(
new Movie("Yip Man", 2008),
new Movie("Ip Man", 2008)
));

assertTrue(comparator.areEqual(
new Movie("The Lion King 1½", 2004),
new Movie("The Lion King 1 1/2", 2004)
));

assertTrue(comparator.areEqual(
new Movie("Nanny McPhee and the Big Bang", 2010),
new Movie("Nanny McPhee et le Big Bang", 2010)
));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public void testParseTitle() throws Exception {
assertEquals("Inception", parseTitle("Inception "));
assertEquals("Inception", parseTitle(" Inception"));
assertEquals(
"Операция Ы и другие приключения Шурика", parseTitle("Операция «Ы» и другие «приключения» Шурика")
"Операция Ы и другие приключения Шурика", parseTitle(null, "Операция «Ы» и другие «приключения» Шурика")
);
assertEquals("null", parseTitle(null));
assertEquals("null", parseTitle(""));
Expand Down

0 comments on commit 9df9980

Please sign in to comment.