diff --git a/src/centroid_update_worker/test_functional/test_CentroidUpdateWorker.cpp b/src/centroid_update_worker/test_functional/test_CentroidUpdateWorker.cpp index 16ea903..d08f0a8 100644 --- a/src/centroid_update_worker/test_functional/test_CentroidUpdateWorker.cpp +++ b/src/centroid_update_worker/test_functional/test_CentroidUpdateWorker.cpp @@ -57,9 +57,10 @@ struct UpdateWorkerTestCtx { UpdateWorkerTestCtx() { threadPool1.reset(new wangle::FutureExecutor(2)); threadPool2.reset(new wangle::FutureExecutor(2)); + sysClock.reset(new Clock); UniquePointer rockHandle(new InMemoryRockHandle("foo")); UniquePointer syncPersistence( - new SyncPersistence(std::move(rockHandle)) + new SyncPersistence(sysClock, std::move(rockHandle)) ); shared_ptr result( new Persistence(std::move(syncPersistence), threadPool1) @@ -67,7 +68,6 @@ struct UpdateWorkerTestCtx { persistence = result; metadb.reset(new CentroidMetadataDb(persistence)); accumulatorFactory.reset(new DocumentAccumulatorFactory); - sysClock.reset(new Clock); updaterFactory.reset(new CentroidUpdaterFactory( persistence, metadb, accumulatorFactory, sysClock )); diff --git a/src/document_processing_worker/test_functional/test_DocumentProcessingWorker.cpp b/src/document_processing_worker/test_functional/test_DocumentProcessingWorker.cpp index 213455f..1b06f0c 100644 --- a/src/document_processing_worker/test_functional/test_DocumentProcessingWorker.cpp +++ b/src/document_processing_worker/test_functional/test_DocumentProcessingWorker.cpp @@ -23,7 +23,7 @@ #include "document_processing_worker/DocumentProcessor.h" #include "document_processing_worker/DocumentProcessingWorker.h" #include "stopwords/StopwordFilter.h" -#include "stemmer/ThreadSafeUtf8Stemmer.h" +#include "stemmer/ThreadSafeStemmerManager.h" #include "models/ProcessedDocument.h" #include "models/Centroid.h" #include "models/Document.h" @@ -44,7 +44,6 @@ using namespace relevanced::centroid_update_worker; using namespace relevanced::document_processing_worker; using namespace relevanced::stemmer; using namespace relevanced::stopwords; -using namespace relevanced::tokenizer; using relevanced::thrift_protocol::Language; using ::testing::Return; @@ -55,7 +54,7 @@ struct ProcessingWorkerTestCtx { shared_ptr persistence; shared_ptr hasher; shared_ptr sysClock; - shared_ptr stemmer; + shared_ptr stemmerManager; shared_ptr stopwordFilter; shared_ptr> threadPool1; shared_ptr> threadPool2; @@ -66,18 +65,18 @@ struct ProcessingWorkerTestCtx { threadPool1.reset(new wangle::FutureExecutor(2)); threadPool2.reset(new wangle::FutureExecutor(2)); UniquePointer rockHandle(new InMemoryRockHandle("foo")); + sysClock.reset(new Clock); UniquePointer syncPersistence( - new SyncPersistence(std::move(rockHandle)) + new SyncPersistence(sysClock, std::move(rockHandle)) ); persistence.reset( new Persistence(std::move(syncPersistence), threadPool1) ); hasher.reset(new Sha1Hasher); - sysClock.reset(new Clock); - stemmer.reset(new ThreadSafeUtf8Stemmer); + stemmerManager.reset(new ThreadSafeStemmerManager); stopwordFilter.reset(new StopwordFilter); processor.reset( - new DocumentProcessor(stemmer, stopwordFilter, sysClock) + new DocumentProcessor(stemmerManager, stopwordFilter, sysClock) ); worker.reset(new DocumentProcessingWorker( processor, hasher, threadPool2 diff --git a/src/server/RelevanceServer.cpp b/src/server/RelevanceServer.cpp index 1ebf657..7713928 100644 --- a/src/server/RelevanceServer.cpp +++ b/src/server/RelevanceServer.cpp @@ -400,6 +400,15 @@ Future>> RelevanceServer::listAllDocuments() { }); } +Future>> RelevanceServer::listUnusedDocuments( + size_t count) { + return persistence_->listUnusedDocuments(count) + .then([](vector docIds) { + return std::move( + folly::make_unique>(docIds) + ); + }); +} Future>> RelevanceServer::listDocumentRange( size_t offset, size_t count) { diff --git a/src/server/RelevanceServer.h b/src/server/RelevanceServer.h index 93bdb4a..7ffeba7 100644 --- a/src/server/RelevanceServer.h +++ b/src/server/RelevanceServer.h @@ -104,6 +104,9 @@ class RelevanceServerIf { virtual folly::Future>> listAllDocuments() = 0; + virtual folly::Future>> + listUnusedDocuments(size_t count) = 0; + virtual folly::Future>> listCentroidRange(size_t offset, size_t count) = 0; @@ -305,6 +308,9 @@ class RelevanceServer : public RelevanceServerIf { folly::Future>> listAllDocuments() override; + folly::Future>> + listUnusedDocuments(size_t count) override; + folly::Future>> listCentroidRange(size_t offset, size_t count) override; diff --git a/src/server/test_functional/test_RelevanceServer.cpp b/src/server/test_functional/test_RelevanceServer.cpp index 161dbe5..a6ca1c8 100644 --- a/src/server/test_functional/test_RelevanceServer.cpp +++ b/src/server/test_functional/test_RelevanceServer.cpp @@ -32,7 +32,8 @@ #include "document_processing_worker/DocumentProcessingWorker.h" #include "similarity_score_worker/SimilarityScoreWorker.h" #include "stopwords/StopwordFilter.h" -#include "stemmer/ThreadSafeUtf8Stemmer.h" +#include "stemmer/Utf8Stemmer.h" +#include "stemmer/ThreadSafeStemmerManager.h" #include "server/RelevanceServer.h" #include "models/ProcessedDocument.h" #include "server/RelevanceServer.h" @@ -58,11 +59,8 @@ using namespace relevanced::similarity_score_worker; using namespace relevanced::stemmer; using namespace relevanced::stopwords; using namespace relevanced::server; -using namespace relevanced::tokenizer; using namespace relevanced::thrift_protocol; - - using ::testing::Return; using ::testing::_; @@ -72,7 +70,7 @@ struct RelevanceServerTestCtx { shared_ptr metadb; shared_ptr hasher; shared_ptr sysClock; - shared_ptr stemmer; + shared_ptr stemmerManager; shared_ptr stopwordFilter; shared_ptr processor; shared_ptr updaterFactory; @@ -92,17 +90,17 @@ struct RelevanceServerTestCtx { scoringThreads.reset(new wangle::FutureExecutor(2)); updatingThreads.reset(new wangle::FutureExecutor(2)); UniquePointer rockHandle(new InMemoryRockHandle("foo")); + sysClock.reset(new Clock); UniquePointer syncPersistence( - new SyncPersistence(std::move(rockHandle)) + new SyncPersistence(sysClock, std::move(rockHandle)) ); persistence.reset(new Persistence(std::move(syncPersistence), persistenceThreads)); hasher.reset(new Sha1Hasher); metadb.reset(new CentroidMetadataDb(persistence)); - stemmer.reset(new ThreadSafeUtf8Stemmer); + stemmerManager.reset(new ThreadSafeStemmerManager); stopwordFilter.reset(new StopwordFilter); - sysClock.reset(new Clock); processor.reset( - new DocumentProcessor(stemmer, stopwordFilter, sysClock) + new DocumentProcessor(stemmerManager, stopwordFilter, sysClock) ); accumulatorFactory.reset(new DocumentAccumulatorFactory); updaterFactory.reset(new CentroidUpdaterFactory( @@ -152,7 +150,8 @@ TEST(RelevanceServer, TestAddDocumentToCentroidHappy) { ctx.persistence->saveCentroid("centroid-id", centroid).get(); ctx.server->createDocumentWithID( folly::make_unique("doc-id"), - folly::make_unique("some text about dogs") + folly::make_unique("some text about dogs"), + Language::EN ).get(); auto response = ctx.server->addDocumentToCentroid( folly::make_unique("centroid-id"), @@ -172,7 +171,8 @@ TEST(RelevanceServer, TestAddDocumentToCentroidAlreadyInCentroid) { ctx.persistence->saveCentroid("centroid-id", centroid).get(); ctx.server->createDocumentWithID( folly::make_unique("doc-id"), - folly::make_unique("some text about dogs") + folly::make_unique("some text about dogs"), + Language::EN ).get(); auto response1 = ctx.server->addDocumentToCentroid( folly::make_unique("centroid-id"), @@ -194,7 +194,8 @@ TEST(RelevanceServer, TestAddDocumentToCentroidMissingDocument) { ctx.persistence->saveCentroid("centroid-id", centroid).get(); ctx.server->createDocumentWithID( folly::make_unique("unrelated-doc-id"), - folly::make_unique("some text about dogs") + folly::make_unique("some text about dogs"), + Language::EN ).get(); auto response = ctx.server->addDocumentToCentroid( folly::make_unique("centroid-id"), @@ -211,7 +212,8 @@ TEST(RelevanceServer, TestAddDocumentToCentroidMissingCentroid) { ctx.persistence->saveCentroid("centroid-id", centroid).get(); ctx.server->createDocumentWithID( folly::make_unique("doc-id"), - folly::make_unique("some text about dogs") + folly::make_unique("some text about dogs"), + Language::EN ).get(); auto response = ctx.server->addDocumentToCentroid( folly::make_unique("missing-centroid-id"), @@ -228,7 +230,8 @@ TEST(RelevanceServer, TestAddDocumentToCentroidMissingBoth) { ctx.persistence->saveCentroid("centroid-id", centroid).get(); ctx.server->createDocumentWithID( folly::make_unique("doc-id"), - folly::make_unique("some text about dogs") + folly::make_unique("some text about dogs"), + Language::EN ).get(); auto response = ctx.server->addDocumentToCentroid( folly::make_unique("missing-centroid-id"), @@ -249,7 +252,8 @@ TEST(RelevanceServer, TestRemoveDocumentFromCentroidHappy) { saves.push_back( ctx.server->createDocumentWithID( folly::make_unique("doc-id"), - folly::make_unique("some text about dogs") + folly::make_unique("some text about dogs"), + Language::EN ).then([](Try> result) { EXPECT_FALSE(result.hasException()); return Try(true); @@ -280,7 +284,8 @@ TEST(RelevanceServer, TestRemoveDocumentFromCentroidDocumentNotInCentroid) { saves.push_back( ctx.server->createDocumentWithID( folly::make_unique("doc-id"), - folly::make_unique("some text about dogs") + folly::make_unique("some text about dogs"), + Language::EN ).then([](Try> result) { EXPECT_FALSE(result.hasException()); return Try(true); @@ -315,7 +320,8 @@ TEST(RelevanceServer, TestRemoveDocumentFromCentroidMissingCentroid) { ctx.persistence->saveCentroid("centroid-id", centroid).get(); ctx.server->createDocumentWithID( folly::make_unique("doc-id"), - folly::make_unique("some text about dogs") + folly::make_unique("some text about dogs"), + Language::EN ).get(); auto response = ctx.server->removeDocumentFromCentroid( folly::make_unique("missing-centroid-id"), @@ -332,7 +338,8 @@ TEST(RelevanceServer, TestRemoveDocumentFromCentroidMissingBoth) { ctx.persistence->saveCentroid("centroid-id", centroid).get(); ctx.server->createDocumentWithID( folly::make_unique("doc-id"), - folly::make_unique("some text about dogs") + folly::make_unique("some text about dogs"), + Language::EN ).get(); auto response = ctx.server->removeDocumentFromCentroid( folly::make_unique("missing-centroid-id"), @@ -350,7 +357,8 @@ TEST(RelevanceServer, TestGetTextSimilarityHappy) { ctx.scoreWorker->reloadCentroid("centroid-id").get(); auto scoreResponse = ctx.server->getTextSimilarity( folly::make_unique("centroid-id"), - folly::make_unique("This is some dog related text which is also about a cat.") + folly::make_unique("This is some dog related text which is also about a cat."), + Language::EN ).get(); EXPECT_FALSE(scoreResponse.hasException()); auto similarity = scoreResponse.value(); @@ -367,7 +375,8 @@ TEST(RelevanceServer, TestGetTextSimilarityMissingCentroid) { ctx.scoreWorker->reloadCentroid("centroid-id").get(); auto scoreResponse = ctx.server->getTextSimilarity( folly::make_unique("unrelated-centroid-id"), - folly::make_unique("This is some dog related text which is also about a cat.") + folly::make_unique("This is some dog related text which is also about a cat."), + Language::EN ).get(); EXPECT_TRUE(scoreResponse.hasException()); } @@ -391,7 +400,8 @@ TEST(RelevanceServer, TestMultiGetTextSimilarityHappy) { vector centroidIds {"centroid-1-id", "centroid-2-id"}; auto scoreResponse = ctx.server->multiGetTextSimilarity( folly::make_unique>(centroidIds), - folly::make_unique("This is some dog related text which is also about a cat.") + folly::make_unique("This is some dog related text which is also about a cat."), + Language::EN ).get(); EXPECT_FALSE(scoreResponse.hasException()); } @@ -401,7 +411,9 @@ TEST(RelevanceServer, TestCreateDocument) { RelevanceServerTestCtx ctx; auto text = folly::make_unique("some text about cats and dogs and fish and so forth"); - auto response = ctx.server->createDocument(std::move(text)).get(); + auto response = ctx.server->createDocument( + std::move(text), Language::EN + ).get(); EXPECT_TRUE(response.hasValue()); string docId = *response.value(); auto persisted = ctx.persistence->loadDocumentOption(docId).get(); @@ -413,7 +425,9 @@ TEST(RelevanceServer, TestCreateDocumentWithID) { RelevanceServerTestCtx ctx; auto text = folly::make_unique("some text about cats and dogs and fish and so forth"); auto id = folly::make_unique("doc-id"); - auto response = ctx.server->createDocumentWithID(std::move(id), std::move(text)).get(); + auto response = ctx.server->createDocumentWithID( + std::move(id), std::move(text), Language::EN + ).get(); EXPECT_TRUE(response.hasValue()); EXPECT_EQ("doc-id", *response.value()); auto persisted = ctx.persistence->loadDocumentOption("doc-id").get(); @@ -425,11 +439,14 @@ TEST(RelevanceServer, TestCreateDocumentWithIDAlreadyExists) { RelevanceServerTestCtx ctx; auto text = folly::make_unique("some text about cats and dogs and fish and so forth"); auto id = folly::make_unique("doc-id"); - auto response1 = ctx.server->createDocumentWithID(std::move(id), std::move(text)).get(); + auto response1 = ctx.server->createDocumentWithID( + std::move(id), std::move(text), Language::EN + ).get(); EXPECT_FALSE(response1.hasException()); auto response2 = ctx.server->createDocumentWithID( folly::make_unique("doc-id"), - folly::make_unique("some text") + folly::make_unique("some text"), + Language::EN ).get(); EXPECT_TRUE(response2.hasException()); } @@ -527,7 +544,8 @@ TEST(RelevanceServer, TestListAllDocuments) { expectedIds.insert(id); creations.push_back(ctx.server->createDocumentWithID( folly::make_unique(id), - folly::make_unique("this is some text about things") + folly::make_unique("this is some text about things"), + Language::EN )); } set createdIds; @@ -553,7 +571,8 @@ TEST(RelevanceServer, TestListDocumentRange) { auto id = sformat("some-doc-{}", i); creations.push_back(ctx.server->createDocumentWithID( folly::make_unique(id), - folly::make_unique("this is some text about things") + folly::make_unique("this is some text about things"), + Language::EN )); } collect(creations).get(); @@ -575,7 +594,8 @@ TEST(RelevanceServer, TestListDocumentRangeFromID) { auto id = sformat("some-doc-{}", i); creations.push_back(ctx.server->createDocumentWithID( folly::make_unique(id), - folly::make_unique("this is some text about things") + folly::make_unique("this is some text about things"), + Language::EN )); } collect(creations).get(); @@ -599,7 +619,8 @@ TEST(RelevanceServer, TestDeleteDocument) { auto id = sformat("some-doc-{}", i); creations.push_back(ctx.server->createDocumentWithID( folly::make_unique(id), - folly::make_unique("this is some text about things") + folly::make_unique("this is some text about things"), + Language::EN )); } collect(creations).get(); @@ -623,10 +644,40 @@ TEST(RelevanceServer, TestDeleteDocumentMissing) { auto id = sformat("some-doc-{}", i); creations.push_back(ctx.server->createDocumentWithID( folly::make_unique(id), - folly::make_unique("this is some text about things") + folly::make_unique("this is some text about things"), + Language::EN )); } collect(creations).get(); auto result = ctx.server->deleteDocument(folly::make_unique("some-doc-8")).get(); EXPECT_TRUE(result.hasException()); } + +// TEST(RelevanceServer, TestListUnusedDocuments) { +// RelevanceServerTestCtx ctx; +// vector>>> documentCreations; +// for (size_t i = 0; i < 6; i++) { +// auto id = sformat("some-doc-{}", i); +// documentCreations.push_back(ctx.server->createDocumentWithID( +// folly::make_unique(id), +// folly::make_unique("this is some text about things"), +// Language::EN +// )); +// } +// collect(documentCreations).get(); +// ctx.server->createCentroid("c1").get(); +// vector>> additions; +// vector toAdd {"some-doc-1", "some-doc-3", "some-doc-4"}; +// for (string id: toAdd) { +// additions.push_back(ctx.server->addDocumentToCentroid( +// folly::make_unique("c1"), +// folly::make_unique(id) +// )); +// } +// collect(additions).get(); +// auto unused = ctx.server->listUnusedDocuments(10).get(); +// vector expected { +// "some-doc-0", "some-doc-2", "some-doc-5" +// }; +// EXPECT_EQ(expected, unused); +// } diff --git a/src/similarity_score_worker/test_functional/test_SimilarityScoreWorker.cpp b/src/similarity_score_worker/test_functional/test_SimilarityScoreWorker.cpp index 1950b18..b84ac12 100644 --- a/src/similarity_score_worker/test_functional/test_SimilarityScoreWorker.cpp +++ b/src/similarity_score_worker/test_functional/test_SimilarityScoreWorker.cpp @@ -29,7 +29,6 @@ #include "document_processing_worker/DocumentProcessingWorker.h" #include "similarity_score_worker/SimilarityScoreWorker.h" #include "stopwords/StopwordFilter.h" -#include "stemmer/ThreadSafeUtf8Stemmer.h" #include "models/ProcessedDocument.h" #include "models/Centroid.h" #include "models/Document.h" @@ -52,7 +51,6 @@ using namespace relevanced::document_processing_worker; using namespace relevanced::similarity_score_worker; using namespace relevanced::stemmer; using namespace relevanced::stopwords; -using namespace relevanced::tokenizer; using thrift_protocol::ECentroidDoesNotExist; @@ -73,13 +71,13 @@ struct SimilarityWorkerTestCtx { threadPool1.reset(new wangle::FutureExecutor(2)); threadPool2.reset(new wangle::FutureExecutor(2)); UniquePointer rockHandle(new InMemoryRockHandle("foo")); + sysClock.reset(new Clock); UniquePointer syncPersistence( - new SyncPersistence(std::move(rockHandle)) + new SyncPersistence(sysClock, std::move(rockHandle)) ); persistence.reset(new Persistence(std::move(syncPersistence), threadPool1)); hasher.reset(new Sha1Hasher); metadb.reset(new CentroidMetadataDb(persistence)); - sysClock.reset(new Clock); worker.reset(new SimilarityScoreWorker( persistence, metadb, threadPool2 )); diff --git a/src/stemmer/ThreadSafeUtf8Stemmer.h b/src/stemmer/ThreadSafeUtf8Stemmer.h deleted file mode 100644 index 7879c96..0000000 --- a/src/stemmer/ThreadSafeUtf8Stemmer.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once -#include "stemmer/StemmerIf.h" -#include "stemmer/Utf8Stemmer.h" - -#include - -namespace relevanced { -namespace stemmer { - -class ThreadSafeUtf8Stemmer: public StemmerIf { -protected: - folly::ThreadLocal stemmer_; -public: - ThreadSafeUtf8Stemmer(){} - size_t getStemPos(const char *toStem, size_t length) override { - return stemmer_->getStemPos(toStem, length); - } - -}; - -} // text_util -} // relevanced