diff --git a/Cargo.lock b/Cargo.lock
index 5bb40029..4c1b75ac 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4996,6 +4996,7 @@ dependencies = [
  "tracing",
  "tracing-subscriber",
  "tracing-test",
+ "unicode-segmentation",
  "url",
  "utoipa",
  "utoipa-swagger-ui",
diff --git a/Cargo.toml b/Cargo.toml
index e9ae94dc..7e9ad95a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -165,6 +165,7 @@ tower-http = {version = "0.5.0", features = ["compression-gzip", "cors"]}
 tracing = {version = "0.1.34", features = ["release_max_level_info"]}
 tracing-subscriber = {version = "0.3.11", features = ["env-filter"]}
 tracing-test = "0.2.4"
+unicode-segmentation = "1.11.0"
 url = {version = "2.4.0", features = ["serde"]}
 utoipa = {version = "4.2.3", features = ["axum_extras"]}
 utoipa-swagger-ui = {version = "7.0.0", features = ["axum"]}
diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml
index 37b636c7..1941940f 100644
--- a/crates/core/Cargo.toml
+++ b/crates/core/Cargo.toml
@@ -102,6 +102,7 @@ tower-http.workspace = true
 tower.workspace = true
 tracing-subscriber.workspace = true
 tracing.workspace = true
+unicode-segmentation.workspace = true
 url.workspace = true
 utoipa-swagger-ui.workspace = true
 utoipa.workspace = true
diff --git a/crates/core/src/ampc/dht/mod.rs b/crates/core/src/ampc/dht/mod.rs
index 9498ef8b..92036334 100644
--- a/crates/core/src/ampc/dht/mod.rs
+++ b/crates/core/src/ampc/dht/mod.rs
@@ -146,7 +146,6 @@ pub mod tests {

     use openraft::{error::InitializeError, Config};
     use proptest::prelude::*;
-    use proptest_derive::Arbitrary;

     use futures::{pin_mut, TryStreamExt};
     use rand::seq::SliceRandom;
@@ -557,7 +556,6 @@ pub mod tests {
         bincode::Encode,
         bincode::Decode,
         PartialEq,
-        Arbitrary,
     )]
     enum Action {
         Set { key: String, value: String },
@@ -566,6 +564,21 @@ pub mod tests {
         Get { prev_key: usize },
     }

+    impl Arbitrary for Action {
+        type Parameters = ();
+        type Strategy = BoxedStrategy<Self>;
+
+        fn arbitrary_with(_args: ()) -> Self::Strategy {
+            prop_oneof![
+                (".{1,10}", ".{1,10}").prop_map(|(key, value)| Action::Set { key, value }),
+                (0..1000).prop_map(|prev_key| Action::Get {
+                    prev_key: prev_key as usize
+                }),
+            ]
+            .boxed()
+        }
+    }
+
     proptest! {
         #![proptest_config(ProptestConfig::with_cases(10))]

diff --git a/crates/core/src/distributed/sonic/mod.rs b/crates/core/src/distributed/sonic/mod.rs
index bfe63b15..84a686ec 100644
--- a/crates/core/src/distributed/sonic/mod.rs
+++ b/crates/core/src/distributed/sonic/mod.rs
@@ -306,7 +306,6 @@ mod tests {
     use std::{collections::HashMap, future::Future};

     use proptest::prelude::*;
-    use proptest_derive::Arbitrary;

     use crate::free_socket_addr;

@@ -343,12 +342,26 @@ mod tests {
         })
     }

-    #[derive(Debug, Clone, bincode::Encode, bincode::Decode, PartialEq, Arbitrary)]
+    #[derive(Debug, Clone, bincode::Encode, bincode::Decode, PartialEq)]
     struct Message {
         text: String,
         other: HashMap<String, f32>,
     }

+    impl Arbitrary for Message {
+        type Parameters = ();
+        type Strategy = BoxedStrategy<Self>;
+
+        fn arbitrary_with(_args: ()) -> Self::Strategy {
+            (
+                any::<String>(),
+                prop::collection::hash_map(".*", 0.0f32..100.0f32, 0..10),
+            )
+                .prop_map(|(text, other)| Message { text, other })
+                .boxed()
+        }
+    }
+
     proptest!
{ #[test] fn basic_arb(a1: Message, b1: Message) { diff --git a/crates/core/src/distributed/sonic/service.rs b/crates/core/src/distributed/sonic/service.rs index b7d3e3a4..489a0276 100644 --- a/crates/core/src/distributed/sonic/service.rs +++ b/crates/core/src/distributed/sonic/service.rs @@ -335,9 +335,8 @@ mod tests { mod counter_service { use std::sync::atomic::AtomicI32; - use proptest_derive::Arbitrary; - use super::super::Message; + use proptest::prelude::*; pub struct CounterService { pub counter: AtomicI32, @@ -346,17 +345,21 @@ mod tests { sonic_service!(CounterService, [Change, Reset]); #[derive( - Debug, - Clone, - serde::Serialize, - serde::Deserialize, - bincode::Encode, - bincode::Decode, - Arbitrary, + Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode, )] pub struct Change { pub amount: i32, } + + impl Arbitrary for Change { + type Parameters = (); + type Strategy = BoxedStrategy; + + fn arbitrary_with(_args: Self::Parameters) -> Self::Strategy { + (0..100).prop_map(|amount| Change { amount }).boxed() + } + } + #[derive( Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode, )] diff --git a/crates/core/src/entity_index/mod.rs b/crates/core/src/entity_index/mod.rs index 24619b46..c27022dd 100644 --- a/crates/core/src/entity_index/mod.rs +++ b/crates/core/src/entity_index/mod.rs @@ -34,7 +34,7 @@ use tantivy::{ use crate::{ image_store::{EntityImageStore, Image, ImageStore}, inverted_index::merge_tantivy_segments, - tokenizer::Normal, + tokenizer::fields::DefaultTokenizer, Result, }; @@ -49,7 +49,7 @@ fn schema() -> Schema { TextOptions::default() .set_indexing_options( TextFieldIndexing::default() - .set_tokenizer(Normal::as_str()) + .set_tokenizer(DefaultTokenizer::as_str()) .set_index_option(IndexRecordOption::WithFreqsAndPositions), ) .set_stored(), @@ -59,7 +59,7 @@ fn schema() -> Schema { TextOptions::default() .set_indexing_options( TextFieldIndexing::default() - .set_tokenizer(Normal::as_str()) + .set_tokenizer(DefaultTokenizer::as_str()) .set_index_option(IndexRecordOption::WithFreqsAndPositions), ) .set_stored(), @@ -164,8 +164,8 @@ impl EntityIndex { .collect(); tantivy_index.tokenizers().register( - Normal::as_str(), - Normal::with_stopwords(stopwords.clone().into_iter().collect()), + DefaultTokenizer::as_str(), + DefaultTokenizer::with_stopwords(stopwords.clone().into_iter().collect()), ); let image_store = EntityImageStore::open(path.as_ref().join("images")); @@ -270,7 +270,7 @@ impl EntityIndex { let entity_abstract = self.schema.get_field("abstract").unwrap(); let mut term_queries = Vec::new(); - let mut tokenizer = Normal::default(); + let mut tokenizer = DefaultTokenizer::default(); let mut stream = tokenizer.token_stream(query); while let Some(token) = stream.next() { if self.stopwords.contains(&token.text) { diff --git a/crates/core/src/feed/index.rs b/crates/core/src/feed/index.rs index 9ae1ecf1..9593d23b 100644 --- a/crates/core/src/feed/index.rs +++ b/crates/core/src/feed/index.rs @@ -21,7 +21,7 @@ use std::{ use crate::{ inverted_index::merge_tantivy_segments, - tokenizer::{Tokenizer, UrlTokenizer}, + tokenizer::fields::{FieldTokenizer, UrlTokenizer}, }; use anyhow::Result; use hashbrown::HashSet; @@ -45,8 +45,8 @@ pub struct FeedIndex { impl FeedIndex { pub fn open>(path: P) -> Result { - let url_tokenizer = Tokenizer::Url(UrlTokenizer); - let kind_tokenizer = Tokenizer::default(); + let url_tokenizer = FieldTokenizer::Url(UrlTokenizer); + let kind_tokenizer = FieldTokenizer::default(); 
let mut builder = tantivy::schema::Schema::builder(); diff --git a/crates/core/src/inverted_index/mod.rs b/crates/core/src/inverted_index/mod.rs index e810702d..c2e9495f 100644 --- a/crates/core/src/inverted_index/mod.rs +++ b/crates/core/src/inverted_index/mod.rs @@ -48,14 +48,14 @@ use crate::ranking::initial::Score; use crate::schema::text_field::TextField; use crate::schema::{numerical_field, text_field, Field, NumericalFieldEnum, TextFieldEnum}; use crate::snippet::TextSnippet; -use crate::tokenizer::{ +use crate::tokenizer::fields::{ BigramTokenizer, Identity, JsonField, Stemmed, TrigramTokenizer, UrlTokenizer, }; use crate::webpage::region::Region; use crate::webpage::schema_org; use crate::Result; -use crate::{schema::create_schema, tokenizer::Tokenizer}; +use crate::{schema::create_schema, tokenizer::FieldTokenizer}; use std::fs; use std::path::Path; use std::sync::Arc; @@ -109,25 +109,25 @@ impl From for tantivy::DocAddress { } fn register_tokenizers(manager: &TokenizerManager) { - let tokenizer = Tokenizer::default(); + let tokenizer = FieldTokenizer::default(); manager.register(tokenizer.as_str(), tokenizer); - let tokenizer = Tokenizer::Stemmed(Stemmed::default()); + let tokenizer = FieldTokenizer::Stemmed(Stemmed::default()); manager.register(tokenizer.as_str(), tokenizer); - let tokenizer = Tokenizer::Identity(Identity::default()); + let tokenizer = FieldTokenizer::Identity(Identity::default()); manager.register(tokenizer.as_str(), tokenizer); - let tokenizer = Tokenizer::Bigram(BigramTokenizer::default()); + let tokenizer = FieldTokenizer::Bigram(BigramTokenizer::default()); manager.register(tokenizer.as_str(), tokenizer); - let tokenizer = Tokenizer::Trigram(TrigramTokenizer::default()); + let tokenizer = FieldTokenizer::Trigram(TrigramTokenizer::default()); manager.register(tokenizer.as_str(), tokenizer); - let tokenizer = Tokenizer::Url(UrlTokenizer); + let tokenizer = FieldTokenizer::Url(UrlTokenizer); manager.register(tokenizer.as_str(), tokenizer); - let tokenizer = Tokenizer::Json(JsonField); + let tokenizer = FieldTokenizer::Json(JsonField); manager.register(tokenizer.as_str(), tokenizer); } diff --git a/crates/core/src/schema/text_field.rs b/crates/core/src/schema/text_field.rs index 9ccb61ec..b7c4d456 100644 --- a/crates/core/src/schema/text_field.rs +++ b/crates/core/src/schema/text_field.rs @@ -29,8 +29,9 @@ use crate::{ enum_dispatch_from_discriminant, enum_map::InsertEnumMapKey, ranking::bm25::Bm25Constants, - tokenizer::{ - self, BigramTokenizer, Identity, JsonField, Tokenizer, TrigramTokenizer, UrlTokenizer, + tokenizer, + tokenizer::fields::{ + BigramTokenizer, FieldTokenizer, Identity, JsonField, TrigramTokenizer, UrlTokenizer, }, webpage::Html, Result, @@ -64,11 +65,11 @@ pub trait TextField: } #[allow(unused_variables)] - fn tokenizer(&self, lang: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::default() + fn tokenizer(&self, lang: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::default() } - fn query_tokenizer(&self, lang: Option<&whatlang::Lang>) -> Tokenizer { + fn query_tokenizer(&self, lang: Option<&whatlang::Lang>) -> FieldTokenizer { self.tokenizer(lang) } @@ -372,10 +373,10 @@ impl TextField for StemmedTitle { "stemmed_title" } - fn tokenizer(&self, lang: Option<&whatlang::Lang>) -> Tokenizer { + fn tokenizer(&self, lang: Option<&whatlang::Lang>) -> FieldTokenizer { match lang { - Some(lang) => tokenizer::Stemmed::with_forced_language(*lang).into(), - None => tokenizer::Stemmed::default().into(), + Some(lang) => 
tokenizer::fields::Stemmed::with_forced_language(*lang).into(), + None => tokenizer::fields::Stemmed::default().into(), } } @@ -418,10 +419,10 @@ impl TextField for StemmedCleanBody { "stemmed_body" } - fn tokenizer(&self, lang: Option<&whatlang::Lang>) -> Tokenizer { + fn tokenizer(&self, lang: Option<&whatlang::Lang>) -> FieldTokenizer { match lang { - Some(lang) => tokenizer::Stemmed::with_forced_language(*lang).into(), - None => tokenizer::Stemmed::default().into(), + Some(lang) => tokenizer::fields::Stemmed::with_forced_language(*lang).into(), + None => tokenizer::fields::Stemmed::default().into(), } } @@ -543,8 +544,8 @@ impl TextField for UrlNoTokenizer { "url_no_tokenizer" } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Identity(Identity {}) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Identity(Identity {}) } fn is_searchable(&self) -> bool { @@ -590,8 +591,8 @@ impl TextField for UrlForSiteOperator { true } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Url(UrlTokenizer) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Url(UrlTokenizer) } fn add_html_tantivy( @@ -673,8 +674,8 @@ impl TextField for SiteNoTokenizer { "site_no_tokenizer" } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Identity(Identity {}) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Identity(Identity {}) } fn is_searchable(&self) -> bool { @@ -716,8 +717,8 @@ impl TextField for DomainNoTokenizer { "domain_no_tokenizer" } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Identity(Identity {}) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Identity(Identity {}) } fn is_searchable(&self) -> bool { @@ -759,8 +760,8 @@ impl TextField for DomainNameNoTokenizer { "domain_name_no_tokenizer" } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Identity(Identity {}) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Identity(Identity {}) } fn is_searchable(&self) -> bool { @@ -802,8 +803,8 @@ impl TextField for SiteIfHomepageNoTokenizer { "site_if_homepage_no_tokenizer" } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Identity(Identity {}) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Identity(Identity {}) } fn add_html_tantivy( @@ -882,8 +883,8 @@ impl TextField for DomainNameIfHomepageNoTokenizer { "domain_name_if_homepage_no_tokenizer" } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Identity(Identity {}) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Identity(Identity {}) } fn add_html_tantivy( @@ -929,8 +930,8 @@ impl TextField for DomainIfHomepageNoTokenizer { "domain_if_homepage_no_tokenizer" } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Identity(Identity {}) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Identity(Identity {}) } fn add_html_tantivy( @@ -1125,8 +1126,8 @@ impl TextField for SchemaOrgJson { "schema_org_json" } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Identity(Identity {}) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Identity(Identity {}) } fn is_stored(&self) -> 
bool { @@ -1161,8 +1162,8 @@ impl TextField for FlattenedSchemaOrgJson { true } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Json(JsonField) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Json(JsonField) } fn add_html_tantivy( @@ -1197,12 +1198,12 @@ impl TextField for CleanBodyBigrams { CleanBody.into() } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Bigram(BigramTokenizer::default()) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Bigram(BigramTokenizer::default()) } - fn query_tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::default() + fn query_tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::default() } fn is_searchable(&self) -> bool { @@ -1241,12 +1242,12 @@ impl TextField for TitleBigrams { Title.into() } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Bigram(BigramTokenizer::default()) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Bigram(BigramTokenizer::default()) } - fn query_tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::default() + fn query_tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::default() } fn is_searchable(&self) -> bool { @@ -1291,12 +1292,12 @@ impl TextField for CleanBodyTrigrams { CleanBody.into() } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Trigram(TrigramTokenizer::default()) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Trigram(TrigramTokenizer::default()) } - fn query_tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::default() + fn query_tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::default() } fn is_searchable(&self) -> bool { @@ -1335,12 +1336,12 @@ impl TextField for TitleTrigrams { Title.into() } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Trigram(TrigramTokenizer::default()) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Trigram(TrigramTokenizer::default()) } - fn query_tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::default() + fn query_tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::default() } fn is_searchable(&self) -> bool { @@ -1405,8 +1406,8 @@ impl TextField for SafetyClassification { "safety_classification" } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Identity(Identity {}) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Identity(Identity {}) } fn add_html_tantivy( @@ -1447,8 +1448,8 @@ impl TextField for InsertionTimestamp { "insertion_timestamp" } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Identity(Identity {}) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Identity(Identity {}) } fn indexing_option(&self) -> IndexingOption { @@ -1490,8 +1491,8 @@ impl TextField for RecipeFirstIngredientTagId { "recipe_first_ingredient_tag_id" } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Identity(Identity {}) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Identity(Identity {}) } fn is_stored(&self) -> bool { @@ -1563,8 +1564,8 @@ impl 
TextField for Links { true } - fn tokenizer(&self, _: Option<&whatlang::Lang>) -> Tokenizer { - Tokenizer::Url(UrlTokenizer) + fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer { + FieldTokenizer::Url(UrlTokenizer) } fn add_html_tantivy( diff --git a/crates/core/src/simhash.rs b/crates/core/src/simhash.rs index 08b275b4..20e15561 100644 --- a/crates/core/src/simhash.rs +++ b/crates/core/src/simhash.rs @@ -19,7 +19,7 @@ use std::{ hash::{Hash, Hasher}, }; -use crate::tokenizer::Tokenizer; +use crate::tokenizer::FieldTokenizer; pub type HashType = u64; @@ -30,7 +30,7 @@ fn hash_token(token: &tantivy::tokenizer::Token) -> HashType { } pub fn hash(text: &str) -> HashType { - let mut tokenizer = Tokenizer::default(); + let mut tokenizer = FieldTokenizer::default(); let mut stream = tantivy::tokenizer::Tokenizer::token_stream(&mut tokenizer, text); diff --git a/crates/core/src/snippet.rs b/crates/core/src/snippet.rs index aed2b2ab..8dfac08a 100644 --- a/crates/core/src/snippet.rs +++ b/crates/core/src/snippet.rs @@ -19,7 +19,9 @@ use std::ops::Range; use crate::config::SnippetConfig; use crate::highlighted::{HighlightedFragment, HighlightedKind}; use crate::query::Query; -use crate::tokenizer::{BigramTokenizer, Normal, Stemmed, Tokenizer, TrigramTokenizer}; +use crate::tokenizer::fields::{ + BigramTokenizer, DefaultTokenizer, FieldTokenizer, Stemmed, TrigramTokenizer, +}; use crate::web_spell::sentence_ranges; use crate::webpage::region::Region; use hashbrown::{HashMap, HashSet}; @@ -81,10 +83,10 @@ struct SnippetBuilder { impl SnippetBuilder { fn highlight(&mut self, terms: &HashSet, lang: whatlang::Lang) { for mut tokenizer in [ - Tokenizer::Stemmed(Stemmed::with_forced_language(lang)), - Tokenizer::Normal(Normal::default()), - Tokenizer::Bigram(BigramTokenizer::default()), - Tokenizer::Trigram(TrigramTokenizer::default()), + FieldTokenizer::Stemmed(Stemmed::with_forced_language(lang)), + FieldTokenizer::Default(DefaultTokenizer::default()), + FieldTokenizer::Bigram(BigramTokenizer::default()), + FieldTokenizer::Trigram(TrigramTokenizer::default()), ] { let mut stream = tantivy::tokenizer::Tokenizer::token_stream(&mut tokenizer, &self.fragment); @@ -143,7 +145,11 @@ impl SnippetBuilder { } } -fn passages(text: &str, mut tokenizer: Tokenizer, config: &SnippetConfig) -> Vec { +fn passages( + text: &str, + mut tokenizer: FieldTokenizer, + config: &SnippetConfig, +) -> Vec { sentence_ranges(text) .into_iter() .filter(|offset| offset.end - offset.start > config.min_passage_width) @@ -216,7 +222,7 @@ fn snippet_string_builder( terms: &[String], lang: whatlang::Lang, config: SnippetConfig, - mut tokenizer: Tokenizer, + mut tokenizer: FieldTokenizer, ) -> SnippetBuilder { let terms: HashSet = terms .iter() @@ -288,7 +294,7 @@ fn snippet_string( lang: whatlang::Lang, config: SnippetConfig, ) -> TextSnippet { - let tokenizer = Tokenizer::Normal(Normal::default()); + let tokenizer = FieldTokenizer::Default(DefaultTokenizer::default()); let snip = snippet_string_builder(text, terms, lang, config.clone(), tokenizer).build(); if !snip.fragments.is_empty() @@ -300,7 +306,7 @@ fn snippet_string( return snip; } - let tokenizer = Tokenizer::Stemmed(Stemmed::with_forced_language(lang)); + let tokenizer = FieldTokenizer::Stemmed(Stemmed::with_forced_language(lang)); snippet_string_builder(text, terms, lang, config, tokenizer).build() } @@ -346,6 +352,7 @@ mod tests { searcher::{LocalSearcher, SearchQuery}, webpage::Webpage, }; + use proptest::prelude::*; const TEST_TEXT: &str = r#"Rust is a 
systems programming language sponsored by Mozilla which describes it as a "safe, concurrent, practical language", supporting functional and @@ -546,7 +553,7 @@ Survey in 2016, 2017, and 2018."#; &["thisis".to_string()], whatlang::Lang::Eng, SnippetConfig::default(), - Tokenizer::Normal(Normal::default()), + FieldTokenizer::Default(DefaultTokenizer::default()), ); let mut terms = HashSet::new(); @@ -561,4 +568,12 @@ Survey in 2016, 2017, and 2018."#; "this is a test" ); } + + proptest! { + #[test] + fn prop_snippet_gen(text: String, query: String) { + let terms = query.split_whitespace().map(|s| s.to_string()).collect::>(); + let _ = snippet_string(&text, &terms, whatlang::Lang::Eng, SnippetConfig::default()); + } + } } diff --git a/crates/core/src/tokenizer/fields/bigram.rs b/crates/core/src/tokenizer/fields/bigram.rs new file mode 100644 index 00000000..548ff74e --- /dev/null +++ b/crates/core/src/tokenizer/fields/bigram.rs @@ -0,0 +1,80 @@ +// Stract is an open source web search engine. +// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +use tantivy::tokenizer::BoxTokenStream; + +use super::{default::DefaultTokenizer, ngram::NGramTokenStream}; + +#[derive(Clone)] +pub struct BigramTokenizer { + inner_tokenizer: DefaultTokenizer, +} + +impl Default for BigramTokenizer { + fn default() -> Self { + Self { + inner_tokenizer: DefaultTokenizer::with_stopwords(vec![]), + } + } +} + +impl BigramTokenizer { + pub fn as_str() -> &'static str { + "bigram_tokenizer" + } +} +impl tantivy::tokenizer::Tokenizer for BigramTokenizer { + type TokenStream<'a> = BoxTokenStream<'a>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + let inner_stream = self.inner_tokenizer.token_stream(text); + let stream: NGramTokenStream<2> = NGramTokenStream::new(inner_stream); + BoxTokenStream::new(stream) + } +} + +#[cfg(test)] +mod tests { + use tantivy::tokenizer::Tokenizer; + + use super::*; + fn tokenize_bigram(s: &str) -> Vec { + let mut res = Vec::new(); + let mut tokenizer = BigramTokenizer::default(); + let mut stream = tokenizer.token_stream(s); + + while let Some(token) = stream.next() { + res.push(token.text.clone()); + } + + res + } + + #[test] + fn bigram_tokenizer() { + assert!(tokenize_bigram("").is_empty()); + assert!(tokenize_bigram("test").is_empty()); + + assert_eq!(tokenize_bigram("this is"), vec!["thisis"]); + assert_eq!(tokenize_bigram("this is a"), vec!["thisis", "isa",]); + assert_eq!( + tokenize_bigram("this is a test"), + vec!["thisis", "isa", "atest",] + ); + + assert_eq!(tokenize_bigram("this.is"), vec!["this.", ".is"]); + } +} diff --git a/crates/core/src/tokenizer/fields/default.rs b/crates/core/src/tokenizer/fields/default.rs new file mode 100644 index 00000000..5c0476bd --- /dev/null +++ b/crates/core/src/tokenizer/fields/default.rs @@ -0,0 +1,217 @@ +// Stract is an open source web search engine. 
+// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +use tantivy::tokenizer::{BoxTokenStream, LowerCaser, StopWordFilter, TextAnalyzer}; + +use crate::tokenizer::{self, Tokenize}; + +#[derive(Clone, Default)] +pub struct DefaultTokenizer { + stopwords: Option>, + analyzer: Option, +} + +impl DefaultTokenizer { + pub fn as_str() -> &'static str { + "tokenizer" + } + + pub fn with_stopwords(stopwords: Vec) -> Self { + Self { + stopwords: Some(stopwords), + analyzer: None, + } + } +} +impl tantivy::tokenizer::Tokenizer for DefaultTokenizer { + type TokenStream<'a> = BoxTokenStream<'a>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + let builder = TextAnalyzer::builder(Normal).filter(LowerCaser); + + self.analyzer = if let Some(stopwords) = &self.stopwords { + Some( + builder + .filter(StopWordFilter::remove(stopwords.clone())) + .build(), + ) + } else { + Some(builder.build()) + }; + + self.analyzer.as_mut().unwrap().token_stream(text) + } +} + +#[derive(Clone)] +pub struct Normal; + +pub struct NormalTokenStream<'a> { + stream: Box + 'a>, + token: Option, + next_position: usize, +} + +impl tantivy::tokenizer::Tokenizer for Normal { + type TokenStream<'a> = BoxTokenStream<'a>; + + fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> { + let stream = Box::new(text.tokenize()); + + BoxTokenStream::new(NormalTokenStream::new_boxed(stream)) + } +} + +impl<'a> tantivy::tokenizer::TokenStream for NormalTokenStream<'a> { + fn advance(&mut self) -> bool { + self.token = self.stream.next().map(|token| { + let span = token.span(); + let pos = self.next_position; + self.next_position += 1; + tantivy::tokenizer::Token { + offset_from: span.start, + offset_to: span.end, + position: pos, + text: token.text().to_string(), + ..Default::default() + } + }); + + self.token.is_some() + } + + fn token(&self) -> &tantivy::tokenizer::Token { + self.token.as_ref().unwrap() + } + + fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { + self.token.as_mut().unwrap() + } +} + +impl<'a> NormalTokenStream<'a> { + fn new_boxed(stream: Box + 'a>) -> BoxTokenStream<'a> { + BoxTokenStream::new(Self { + stream, + token: None, + next_position: 0, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use proptest::prelude::*; + use tantivy::tokenizer::Tokenizer as _; + + fn tokenize_default(s: &str) -> Vec { + let mut res = Vec::new(); + let mut tokenizer = DefaultTokenizer::default(); + let mut stream = tokenizer.token_stream(s); + + while let Some(token) = stream.next() { + res.push(token.text.clone()); + } + + res + } + + #[test] + fn default_tokenization() { + assert_eq!( + tokenize_default("this is a relatively simple123 test string"), + vec![ + "this", + "is", + "a", + "relatively", + "simple123", + "test", + "string" + ] + ); + } + + #[test] + fn special_character_tokenization() { + assert_eq!( + tokenize_default("example.com"), + 
vec!["example", ".", "com",] + ); + assert_eq!( + tokenize_default("example. com"), + vec!["example", ".", "com",] + ); + assert_eq!( + tokenize_default("example . com"), + vec!["example", ".", "com",] + ); + + assert_eq!( + tokenize_default("a c++ blog post"), + vec!["a", "c", "+", "+", "blog", "post"] + ); + assert_eq!(tokenize_default("path/test"), vec!["path", "/", "test",]); + } + + #[test] + fn han() { + assert_eq!( + tokenize_default("test 漢.com"), + vec!["test", "漢", ".", "com"] + ); + } + + #[test] + fn hiragana() { + assert_eq!( + tokenize_default("test あ.com"), + vec!["test", "あ", ".", "com"] + ); + } + + #[test] + fn katakana() { + assert_eq!( + tokenize_default("test ダ.com"), + vec!["test", "ダ", ".", "com"] + ); + } + + #[test] + fn cyrillic() { + assert_eq!( + tokenize_default("test б.com"), + vec!["test", "б", ".", "com"] + ); + } + + #[test] + fn arabic() { + assert_eq!( + tokenize_default("test ب.com"), + vec!["test", "ب", ".", "com"] + ); + } + + proptest! { + #[test] + fn prop_default_tokenization(s: String) { + let _ = tokenize_default(&s); + } + } +} diff --git a/crates/core/src/tokenizer/fields/identity.rs b/crates/core/src/tokenizer/fields/identity.rs new file mode 100644 index 00000000..957b28e6 --- /dev/null +++ b/crates/core/src/tokenizer/fields/identity.rs @@ -0,0 +1,95 @@ +use tantivy::tokenizer::BoxTokenStream; + +// Stract is an open source web search engine. +// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+#[derive(Clone, Default, Debug)] +pub struct Identity {} + +impl Identity { + pub fn as_str() -> &'static str { + "identity_tokenizer" + } +} +impl tantivy::tokenizer::Tokenizer for Identity { + type TokenStream<'a> = BoxTokenStream<'a>; + + fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> { + BoxTokenStream::new(IdentityTokenStream::from(text.to_string())) + } +} +pub struct IdentityTokenStream { + num_advances: usize, + token: Option, +} + +impl From for IdentityTokenStream { + fn from(text: String) -> Self { + Self { + num_advances: 0, + token: Some(tantivy::tokenizer::Token { + offset_from: 0, + offset_to: text.len(), + position: 0, + text, + ..Default::default() + }), + } + } +} +impl tantivy::tokenizer::TokenStream for IdentityTokenStream { + fn advance(&mut self) -> bool { + self.num_advances += 1; + + if self.num_advances == 1 { + true + } else { + self.token = None; + false + } + } + + fn token(&self) -> &tantivy::tokenizer::Token { + self.token.as_ref().unwrap() + } + + fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { + self.token.as_mut().unwrap() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tantivy::tokenizer::Tokenizer as _; + + fn tokenize_identity(s: &str) -> Vec { + let mut res = Vec::new(); + let mut tokenizer = Identity {}; + let mut stream = tokenizer.token_stream(s); + + while let Some(token) = stream.next() { + res.push(token.text.clone()); + } + + res + } + + #[test] + fn identity() { + assert_eq!(tokenize_identity("this is a test"), vec!["this is a test"]); + assert_eq!(tokenize_identity("a-b"), vec!["a-b"]); + } +} diff --git a/crates/core/src/tokenizer/fields/json.rs b/crates/core/src/tokenizer/fields/json.rs new file mode 100644 index 00000000..5dd551d1 --- /dev/null +++ b/crates/core/src/tokenizer/fields/json.rs @@ -0,0 +1,356 @@ +// Stract is an open source web search engine. +// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+ +use std::str::CharIndices; + +use tantivy::tokenizer::BoxTokenStream; + +use crate::{ceil_char_boundary, floor_char_boundary}; + +pub struct FlattenedJson { + flattened_json: String, + inner_tokenizer: JsonField, +} + +struct IntermediateFlatValue { + parent_keys: Vec, + val: serde_json::Value, +} + +fn flatten(val: serde_json::Value) -> Vec { + let mut res = Vec::new(); + + let mut stack = Vec::new(); + stack.push(IntermediateFlatValue { + parent_keys: Vec::new(), + val, + }); + + while let Some(elem) = stack.pop() { + match elem.val { + serde_json::Value::Null => { + res.push(itertools::intersperse(elem.parent_keys, ".".to_string()).collect()) + } + serde_json::Value::Bool(b) => { + let key: String = + itertools::intersperse(elem.parent_keys, ".".to_string()).collect(); + res.push(format!("{key}=\"{b}\"")) + } + serde_json::Value::Number(n) => { + let key: String = + itertools::intersperse(elem.parent_keys, ".".to_string()).collect(); + res.push(format!("{key}=\"{n}\"")) + } + serde_json::Value::String(s) => { + let key: String = + itertools::intersperse(elem.parent_keys, ".".to_string()).collect(); + res.push(format!("{key}=\"{}\"", s.replace('"', "\\\""))) + } + serde_json::Value::Array(arr) => { + for item in arr { + stack.push(IntermediateFlatValue { + parent_keys: elem.parent_keys.clone(), + val: item, + }); + } + } + serde_json::Value::Object(map) => { + for (key, val) in map { + let mut parent_keys = elem.parent_keys.clone(); + parent_keys.push(key); + + stack.push(IntermediateFlatValue { parent_keys, val }); + } + } + } + } + + res.reverse(); + + res +} + +impl FlattenedJson { + pub fn new(value: &T) -> crate::Result + where + T: serde::Serialize, + { + let json = serde_json::to_string(value)?; + let val: serde_json::Value = serde_json::from_str(&json)?; + + let flattened_json = itertools::intersperse(flatten(val), "\n".to_string()).collect(); + + Ok(Self { + flattened_json, + inner_tokenizer: JsonField, + }) + } + + pub fn token_stream(&mut self) -> BoxTokenStream { + tantivy::tokenizer::Tokenizer::token_stream(&mut self.inner_tokenizer, &self.flattened_json) + } + + pub fn text(&self) -> &str { + &self.flattened_json + } +} + +#[derive(Clone, Debug)] +pub struct JsonField; + +impl JsonField { + pub fn as_str() -> &'static str { + "json_tokenizer" + } +} + +impl tantivy::tokenizer::Tokenizer for JsonField { + type TokenStream<'a> = BoxTokenStream<'a>; + + fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> { + BoxTokenStream::new(JsonFieldTokenStream { + text, + chars: text.char_indices(), + token: tantivy::tokenizer::Token::default(), + }) + } +} + +pub struct JsonFieldTokenStream<'a> { + text: &'a str, + chars: CharIndices<'a>, + token: tantivy::tokenizer::Token, +} + +impl<'a> JsonFieldTokenStream<'a> { + // search for the end of the current token. + fn search_token_end(&mut self, is_quote: bool) -> usize { + let mut escaped = false; + for (offset, c) in self.chars.by_ref() { + if is_quote { + if c == '\\' { + escaped = true; + } else { + if c == '"' && !escaped { + return offset; + } + + escaped = false; + } + } else if !c.is_alphanumeric() { + return offset; + } + } + + self.text.len() + } +} + +impl<'a> tantivy::tokenizer::TokenStream for JsonFieldTokenStream<'a> { + fn advance(&mut self) -> bool { + self.token.text.clear(); + self.token.position = self.token.position.wrapping_add(1); + let mut prev_was_quote = false; + + while let Some((offset_from, c)) = self.chars.next() { + if !matches!(c, '.' 
| '\n' | '"') { + let offset_to = self.search_token_end(prev_was_quote); + self.token.offset_from = offset_from; + self.token.offset_to = offset_to; + + if prev_was_quote { + self.token.offset_from -= 1; + self.token.offset_to += 1; + + self.token.offset_from = floor_char_boundary(self.text, self.token.offset_from); + self.token.offset_to = + ceil_char_boundary(self.text, self.token.offset_to).min(self.text.len()); + } + + self.token + .text + .push_str(&self.text[self.token.offset_from..self.token.offset_to]); + return true; + } + + prev_was_quote = c == '"'; + } + false + } + + fn token(&self) -> &tantivy::tokenizer::Token { + &self.token + } + + fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { + &mut self.token + } +} + +#[cfg(test)] +mod tests { + use tantivy::tokenizer::Tokenizer; + + use super::*; + + fn tokenize_json(s: &str) -> Vec { + let mut res = Vec::new(); + let mut tokenizer = JsonField; + let mut stream = tokenizer.token_stream(s); + + while let Some(token) = stream.next() { + res.push(token.text.clone()); + } + + res + } + + #[test] + fn tokenize_json_field() { + assert_eq!( + tokenize_json(r#"Test.field="value""#), + vec!["Test", "field", "\"value\"",] + ); + assert_eq!( + tokenize_json(r#"Test.field="this is the value""#), + vec!["Test", "field", "\"this is the value\"",] + ); + assert_eq!( + tokenize_json(r#"Test.field="this is\" the value""#), + vec!["Test", "field", "\"this is\\\" the value\"",] + ); + assert_eq!( + tokenize_json("Test.field=\"this*@# is\\\" the\\\" \nvalue\""), + vec!["Test", "field", "\"this*@# is\\\" the\\\" \nvalue\"",] + ); + } + + fn flattened_json_helper(json: &str, expected: &str) { + let parsed: serde_json::Value = serde_json::from_str(json).unwrap(); + let flattened = FlattenedJson::new(&parsed).unwrap(); + let flat = flattened.text(); + + assert_eq!(flat, expected); + } + + #[test] + fn flatten_json_object() { + let json = r#" + { + "key1": "val1", + "key2": "val2" + } + "#; + let expected = r#"key1="val1" +key2="val2""#; + + flattened_json_helper(json, expected); + + let json = r#" + { + "key1": 1, + "key2": 2 + } + "#; + let expected = r#"key1="1" +key2="2""#; + + flattened_json_helper(json, expected); + + let json = r#" + { + "key1": { + "key2": "value1", + "key3": "value2" + } + } + "#; + let expected = r#"key1.key2="value1" +key1.key3="value2""#; + + flattened_json_helper(json, expected); + + let json = r#" + { + "$key1": { + "$key2": "value1", + "key3": "value2" + } + } + "#; + let expected = r#"$key1.$key2="value1" +$key1.key3="value2""#; + + flattened_json_helper(json, expected); + + let json = r#" + { + "key1": [ + "value1", + "value2" + ] + } + "#; + let expected = r#"key1="value1" +key1="value2""#; + + flattened_json_helper(json, expected); + + let json = r#" + { + "key1": [ + "value1", + { + "key2": "value2", + "key3": 123 + } + ] + } + "#; + let expected = r#"key1="value1" +key1.key2="value2" +key1.key3="123""#; + + flattened_json_helper(json, expected); + + let json = r#" + { + "key1": [ + "value1", + { + "key2": "this\" is @ a # test" + } + ] + } + "#; + let expected = r#"key1="value1" +key1.key2="this\" is @ a # test""#; + + flattened_json_helper(json, expected); + } + + #[test] + fn out_of_bounds_crash() { + tokenize_json( + r#" +Breadcrumb.title="Home" +Breadcrumb.url="https://www.eurotecnicaservice.it/?lang=en" +Breadcrumb.title="Fuser Pur" +Breadcrumb.url="https://www.eurotecnicaservice.it/testing\" +"#, + ); + } +} diff --git a/crates/core/src/tokenizer/fields/mod.rs b/crates/core/src/tokenizer/fields/mod.rs 
new file mode 100644 index 00000000..1558c708 --- /dev/null +++ b/crates/core/src/tokenizer/fields/mod.rs @@ -0,0 +1,84 @@ +// Stract is an open source web search engine. +// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +use tantivy::tokenizer::BoxTokenStream; + +pub use self::{ + bigram::BigramTokenizer, default::DefaultTokenizer, identity::Identity, json::FlattenedJson, + json::JsonField, stemmed::Stemmed, trigram::TrigramTokenizer, url::UrlTokenizer, +}; + +mod default; +mod identity; +mod json; +mod stemmed; +mod url; + +mod bigram; +mod ngram; +mod trigram; + +#[derive(Clone)] +pub enum FieldTokenizer { + Default(DefaultTokenizer), + Identity(Identity), + Stemmed(Stemmed), + Bigram(BigramTokenizer), + Trigram(TrigramTokenizer), + Json(JsonField), + Url(UrlTokenizer), +} + +impl FieldTokenizer { + pub fn as_str(&self) -> &'static str { + match self { + FieldTokenizer::Default(_) => DefaultTokenizer::as_str(), + FieldTokenizer::Stemmed(_) => Stemmed::as_str(), + FieldTokenizer::Identity(_) => Identity::as_str(), + FieldTokenizer::Bigram(_) => BigramTokenizer::as_str(), + FieldTokenizer::Trigram(_) => TrigramTokenizer::as_str(), + FieldTokenizer::Json(_) => JsonField::as_str(), + FieldTokenizer::Url(_) => UrlTokenizer::as_str(), + } + } +} + +impl From for FieldTokenizer { + fn from(stemmed: Stemmed) -> Self { + Self::Stemmed(stemmed) + } +} + +impl Default for FieldTokenizer { + fn default() -> Self { + Self::Default(DefaultTokenizer::default()) + } +} +impl tantivy::tokenizer::Tokenizer for FieldTokenizer { + type TokenStream<'a> = BoxTokenStream<'a>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + match self { + FieldTokenizer::Default(tokenizer) => tokenizer.token_stream(text), + FieldTokenizer::Stemmed(tokenizer) => tokenizer.token_stream(text), + FieldTokenizer::Identity(tokenizer) => tokenizer.token_stream(text), + FieldTokenizer::Json(tokenizer) => tokenizer.token_stream(text), + FieldTokenizer::Bigram(tokenizer) => tokenizer.token_stream(text), + FieldTokenizer::Trigram(tokenizer) => tokenizer.token_stream(text), + FieldTokenizer::Url(tokenizer) => tokenizer.token_stream(text), + } + } +} diff --git a/crates/core/src/tokenizer/fields/ngram.rs b/crates/core/src/tokenizer/fields/ngram.rs new file mode 100644 index 00000000..369972b2 --- /dev/null +++ b/crates/core/src/tokenizer/fields/ngram.rs @@ -0,0 +1,92 @@ +// Stract is an open source web search engine. +// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +use tantivy::tokenizer::BoxTokenStream; + +pub struct NGramTokenStream<'a, const N: usize> { + inner: BoxTokenStream<'a>, + token: tantivy::tokenizer::Token, + token_window: [tantivy::tokenizer::Token; N], + next_pos: usize, +} + +impl<'a, const N: usize> NGramTokenStream<'a, N> { + pub fn new(inner: BoxTokenStream<'a>) -> Self { + Self { + inner, + token: tantivy::tokenizer::Token::default(), + token_window: std::array::from_fn(|_| tantivy::tokenizer::Token::default()), + next_pos: 0, + } + } +} + +fn reuse_token_alloc(token: &mut tantivy::tokenizer::Token, new_token: &tantivy::tokenizer::Token) { + token.text.clear(); + token.text += new_token.text.as_str(); + token.offset_from = new_token.offset_from; + token.offset_to = new_token.offset_to; + token.position = new_token.position; + token.position_length = new_token.position_length; +} + +impl<'a, const N: usize> tantivy::tokenizer::TokenStream for NGramTokenStream<'a, N> { + fn advance(&mut self) -> bool { + if !self.inner.advance() { + return false; + } + + self.token_window.rotate_left(1); + reuse_token_alloc(&mut self.token_window[N - 1], self.inner.token()); + + while self.token_window[0].text.is_empty() { + if !self.inner.advance() { + return false; + } + + self.token_window.rotate_left(1); + reuse_token_alloc(&mut self.token_window[N - 1], self.inner.token()); + } + + self.next_pos += 1; + + let begin = self + .token_window + .iter() + .position(|token| !token.text.is_empty()) + .unwrap_or(N - 1); + + self.token.position = self.next_pos; + self.token.offset_from = self.token_window[begin].offset_from; + self.token.offset_to = self.token_window[N - 1].offset_to; + self.token.position_length = N - begin; + + self.token.text.clear(); + for token in &self.token_window { + self.token.text += token.text.as_str(); + } + + true + } + + fn token(&self) -> &tantivy::tokenizer::Token { + &self.token + } + + fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { + &mut self.token + } +} diff --git a/crates/core/src/tokenizer/fields/stemmed.rs b/crates/core/src/tokenizer/fields/stemmed.rs new file mode 100644 index 00000000..77af3df8 --- /dev/null +++ b/crates/core/src/tokenizer/fields/stemmed.rs @@ -0,0 +1,59 @@ +// Stract is an open source web search engine. +// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+ +use tantivy::tokenizer::{BoxTokenStream, LowerCaser, TextAnalyzer}; +use whatlang::Lang; + +use crate::tokenizer::stemmer::Stemmer; + +use super::default::Normal; + +#[derive(Clone, Default)] +pub struct Stemmed { + force_language: Option, + analyzer: Option, +} + +impl Stemmed { + pub fn as_str() -> &'static str { + "stemmed_tokenizer" + } + pub fn with_forced_language(lang: Lang) -> Self { + Self { + force_language: Some(lang), + analyzer: None, + } + } +} +impl tantivy::tokenizer::Tokenizer for Stemmed { + type TokenStream<'a> = BoxTokenStream<'a>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + let builder = TextAnalyzer::builder(Normal).filter(LowerCaser); + + let lang = match self.force_language { + Some(lang) => Some(lang), + None => whatlang::detect_lang(text), + }; + + self.analyzer = match lang { + Some(lang) => Some(builder.filter(Stemmer::from(lang).into_tantivy()).build()), + None => Some(builder.build()), + }; + + self.analyzer.as_mut().unwrap().token_stream(text) + } +} diff --git a/crates/core/src/tokenizer/fields/trigram.rs b/crates/core/src/tokenizer/fields/trigram.rs new file mode 100644 index 00000000..54fc0884 --- /dev/null +++ b/crates/core/src/tokenizer/fields/trigram.rs @@ -0,0 +1,79 @@ +use tantivy::tokenizer::BoxTokenStream; + +use super::{default::DefaultTokenizer, ngram::NGramTokenStream}; + +// Stract is an open source web search engine. +// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+#[derive(Clone)] +pub struct TrigramTokenizer { + inner_tokenizer: DefaultTokenizer, +} + +impl Default for TrigramTokenizer { + fn default() -> Self { + Self { + inner_tokenizer: DefaultTokenizer::with_stopwords(vec![]), + } + } +} + +impl TrigramTokenizer { + pub fn as_str() -> &'static str { + "trigram_tokenizer" + } +} +impl tantivy::tokenizer::Tokenizer for TrigramTokenizer { + type TokenStream<'a> = BoxTokenStream<'a>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + let inner = self.inner_tokenizer.token_stream(text); + let stream: NGramTokenStream<3> = NGramTokenStream::new(inner); + BoxTokenStream::new(stream) + } +} + +#[cfg(test)] +mod tests { + use tantivy::tokenizer::Tokenizer; + + use super::*; + + fn tokenize_trigram(s: &str) -> Vec { + let mut res = Vec::new(); + + let mut tokenizer = TrigramTokenizer::default(); + let mut stream = tokenizer.token_stream(s); + + while let Some(token) = stream.next() { + res.push(token.text.clone()); + } + + res + } + + #[test] + fn trigram_tokenizer() { + assert!(tokenize_trigram("").is_empty()); + assert!(tokenize_trigram("test").is_empty()); + assert!(tokenize_trigram("this is").is_empty()); + + assert_eq!(tokenize_trigram("this is a"), vec!["thisisa",]); + assert_eq!( + tokenize_trigram("this is a test"), + vec!["thisisa", "isatest"] + ); + } +} diff --git a/crates/core/src/tokenizer/fields/url.rs b/crates/core/src/tokenizer/fields/url.rs new file mode 100644 index 00000000..e2560dc6 --- /dev/null +++ b/crates/core/src/tokenizer/fields/url.rs @@ -0,0 +1,264 @@ +// Stract is an open source web search engine. +// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+ +use std::collections::VecDeque; + +use tantivy::tokenizer::BoxTokenStream; + +use crate::tokenizer::{add_space_last::AddSpaceLast, split_preserve::StrSplitPreserve}; + +#[derive(Clone, Default)] +struct ParsedUrl { + protocol: Option>, + domain: Option>, + path: VecDeque, +} + +#[derive(Debug, Clone)] +pub struct UrlTokenizer; + +impl UrlTokenizer { + pub fn as_str() -> &'static str { + "url_tokenizer" + } + + fn parse_url(text: &str) -> ParsedUrl { + url::Url::parse(text) + .or_else(|_| url::Url::parse(&format!("http://{}", text))) + .map(|url| { + let domain = Some( + url.host_str() + .unwrap_or("") + .split_preserve(|c| matches!(c, '.')) + .filter(|s| !(*s).is_empty()) + .map(|s| s.to_string()) + .add_space_last() + .collect(), + ); + let path: VecDeque<_> = url + .path() + .split_preserve(|c| matches!(c, '/' | '-' | '_')) + .filter(|s| !(*s).is_empty()) + .map(|s| s.to_string()) + .collect(); + + if matches!(url.scheme(), "http" | "https") { + ParsedUrl { + protocol: None, + domain, + path, + } + } else { + let mut v = VecDeque::new(); + v.push_back(url.scheme().to_string()); + + ParsedUrl { + protocol: Some(v), + domain, + path, + } + } + }) + .unwrap_or_default() + } +} + +impl tantivy::tokenizer::Tokenizer for UrlTokenizer { + type TokenStream<'a> = BoxTokenStream<'a>; + + fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> { + let text = text.replace(' ', "%20"); + + let urls = text + .split('\n') + .filter(|s| !s.is_empty()) + .map(|s| s.to_lowercase()) + .map(|s| Self::parse_url(&s)) + .collect(); + + BoxTokenStream::new(SiteOperatorUrlTokenStream::new(urls)) + } +} + +pub struct SiteOperatorUrlTokenStream { + urls: VecDeque, + current_url: ParsedUrl, + token: tantivy::tokenizer::Token, +} + +impl SiteOperatorUrlTokenStream { + fn new(mut urls: VecDeque) -> Self { + let current_url = urls.pop_front().unwrap_or_default(); + + Self { + urls, + current_url, + token: tantivy::tokenizer::Token::default(), + } + } + + fn advance_current_url(&mut self) -> bool { + if let Some(protocol) = self.current_url.protocol.as_mut() { + self.token.position = self.token.position.wrapping_add(1); + self.token.text.clear(); + + if let Some(s) = protocol.pop_front() { + self.token.text.push_str(&s); + self.token.offset_from = 0; + self.token.offset_to = s.len(); + } else { + self.token.offset_from = self.token.offset_to; + self.token.text.push_str("://"); + self.token.offset_to += self.token.text.len(); + + self.current_url.protocol = None; + } + + return true; + } + + if let Some(domain) = self.current_url.domain.as_mut() { + if let Some(s) = domain.pop_front() { + self.token.text.clear(); + self.token.position = self.token.position.wrapping_add(1); + + self.token.text.push_str(&s); + + self.token.offset_from = self.token.offset_to; + self.token.offset_to += self.token.text.len(); + return true; + } + } + + if let Some(s) = self.current_url.path.pop_front() { + self.token.text.clear(); + self.token.position = self.token.position.wrapping_add(1); + + self.token.text.push_str(&s); + self.token.offset_from = self.token.offset_to; + self.token.offset_to += self.token.text.len(); + + return true; + } + + false + } + + fn next_url(&mut self) -> bool { + if let Some(url) = self.urls.pop_front() { + self.current_url = url; + + self.token.position = self.token.position.wrapping_add(1); + self.token.text.clear(); + self.token.text.push('\n'); + + self.token.offset_from = self.token.offset_to; + self.token.offset_to += self.token.text.len(); + + true + } else { + false + } + } +} + +impl 
tantivy::tokenizer::TokenStream for SiteOperatorUrlTokenStream { + fn advance(&mut self) -> bool { + if self.advance_current_url() { + return true; + } + + self.next_url() + } + + fn token(&self) -> &tantivy::tokenizer::Token { + &self.token + } + + fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { + &mut self.token + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tantivy::tokenizer::Tokenizer as _; + + fn tokenize_url(s: &str) -> Vec { + let mut res = Vec::new(); + let mut tokenizer = UrlTokenizer; + let mut stream = tokenizer.token_stream(s); + + while let Some(token) = stream.next() { + res.push(token.text.clone()); + } + + res + } + + #[test] + fn url() { + assert_eq!( + tokenize_url("https://www.example.com"), + vec!["www", ".", "example", ".", "com ", "/"] + ); + + assert_eq!( + tokenize_url("https://www.example.com/test"), + vec!["www", ".", "example", ".", "com ", "/", "test",] + ); + + assert_eq!( + tokenize_url("example.com"), + vec!["example", ".", "com ", "/"] + ); + + assert_eq!( + tokenize_url("example.com/another/path"), + vec!["example", ".", "com ", "/", "another", "/", "path",] + ); + + assert_eq!(tokenize_url(".com"), vec![".", "com ", "/"]) + } + + #[test] + fn multiple_urls() { + assert_eq!( + tokenize_url("https://www.example.com\nhttps://www.example.com"), + vec![ + "www", ".", "example", ".", "com ", "/", "\n", "www", ".", "example", ".", "com ", + "/" + ] + ); + + assert_eq!( + tokenize_url("https://www.example.com/test\nhttps://www.abcd.com"), + vec![ + "www", ".", "example", ".", "com ", "/", "test", "\n", "www", ".", "abcd", ".", + "com ", "/" + ] + ); + + assert_eq!( + tokenize_url("https://example.com/test\nhttps://www.abcd.com/test"), + vec![ + "example", ".", "com ", "/", "test", "\n", "www", ".", "abcd", ".", "com ", "/", + "test", + ] + ); + } +} diff --git a/crates/core/src/tokenizer/mod.rs b/crates/core/src/tokenizer/mod.rs index 402d5406..74e2fb3a 100644 --- a/crates/core/src/tokenizer/mod.rs +++ b/crates/core/src/tokenizer/mod.rs @@ -1,5 +1,5 @@ // Stract is an open source web search engine. -// Copyright (C) 2023 Stract ApS +// Copyright (C) 2024 Stract ApS // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as @@ -14,1138 +14,83 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
-use std::{array, collections::VecDeque, str::CharIndices}; - -use logos::{Lexer, Logos}; -use tantivy::tokenizer::{ - BoxTokenStream, Language, LowerCaser, Stemmer, StopWordFilter, TextAnalyzer, -}; - -use whatlang::Lang; - -use crate::{ceil_char_boundary, floor_char_boundary}; - -use self::{add_space_last::AddSpaceLast, split_preserve::StrSplitPreserve}; - mod add_space_last; +pub mod fields; +mod script; +mod script_tokenizer; +mod segmenter; mod split_preserve; +mod split_whitespace_with_range; +mod stemmer; -struct MyStemmer(Stemmer); - -impl From for MyStemmer { - fn from(lang: Lang) -> Self { - match lang { - Lang::Dan => MyStemmer(Stemmer::new(Language::Danish)), - Lang::Ara => MyStemmer(Stemmer::new(Language::Arabic)), - Lang::Nld => MyStemmer(Stemmer::new(Language::Dutch)), - Lang::Fin => MyStemmer(Stemmer::new(Language::Finnish)), - Lang::Fra => MyStemmer(Stemmer::new(Language::French)), - Lang::Deu => MyStemmer(Stemmer::new(Language::German)), - Lang::Hun => MyStemmer(Stemmer::new(Language::Hungarian)), - Lang::Ita => MyStemmer(Stemmer::new(Language::Italian)), - Lang::Por => MyStemmer(Stemmer::new(Language::Portuguese)), - Lang::Ron => MyStemmer(Stemmer::new(Language::Romanian)), - Lang::Rus => MyStemmer(Stemmer::new(Language::Russian)), - Lang::Spa => MyStemmer(Stemmer::new(Language::Spanish)), - Lang::Swe => MyStemmer(Stemmer::new(Language::Swedish)), - Lang::Tam => MyStemmer(Stemmer::new(Language::Tamil)), - Lang::Tur => MyStemmer(Stemmer::new(Language::Turkish)), - _ => MyStemmer(Stemmer::new(Language::English)), - } - } -} - -#[derive(Clone)] -pub enum Tokenizer { - Normal(Normal), - Identity(Identity), - Stemmed(Stemmed), - Bigram(BigramTokenizer), - Trigram(TrigramTokenizer), - Json(JsonField), - Url(UrlTokenizer), -} - -impl Tokenizer { - pub fn as_str(&self) -> &'static str { - match self { - Tokenizer::Normal(_) => Normal::as_str(), - Tokenizer::Stemmed(_) => Stemmed::as_str(), - Tokenizer::Identity(_) => Identity::as_str(), - Tokenizer::Bigram(_) => BigramTokenizer::as_str(), - Tokenizer::Trigram(_) => TrigramTokenizer::as_str(), - Tokenizer::Json(_) => JsonField::as_str(), - Tokenizer::Url(_) => UrlTokenizer::as_str(), - } - } -} - -impl From for Tokenizer { - fn from(stemmed: Stemmed) -> Self { - Self::Stemmed(stemmed) - } -} - -impl Default for Tokenizer { - fn default() -> Self { - Self::Normal(Normal::default()) - } -} - -#[derive(Clone, Default)] -pub struct Normal { - stopwords: Option>, - analyzer: Option, -} - -impl Normal { - pub fn as_str() -> &'static str { - "tokenizer" - } - - pub fn with_stopwords(stopwords: Vec) -> Self { - Self { - stopwords: Some(stopwords), - analyzer: None, - } - } -} - -#[derive(Clone)] -pub struct BigramTokenizer { - inner_tokenizer: Normal, -} - -impl Default for BigramTokenizer { - fn default() -> Self { - Self { - inner_tokenizer: Normal::with_stopwords(vec![".".to_string()]), - } - } -} - -impl BigramTokenizer { - pub fn as_str() -> &'static str { - "bigram_tokenizer" - } -} - -#[derive(Clone)] -pub struct TrigramTokenizer { - inner_tokenizer: Normal, -} - -impl Default for TrigramTokenizer { - fn default() -> Self { - Self { - inner_tokenizer: Normal::with_stopwords(vec![".".to_string()]), - } - } -} - -impl TrigramTokenizer { - pub fn as_str() -> &'static str { - "trigram_tokenizer" - } -} - -#[derive(Clone, Default)] -pub struct Stemmed { - force_language: Option, - analyzer: Option, -} - -impl Stemmed { - pub fn as_str() -> &'static str { - "stemmed_tokenizer" - } - pub fn with_forced_language(lang: Lang) -> Self { - 
Self { - force_language: Some(lang), - analyzer: None, - } - } -} - -#[derive(Clone, Default, Debug)] -pub struct Identity {} - -impl Identity { - pub fn as_str() -> &'static str { - "identity_tokenizer" - } -} - -impl tantivy::tokenizer::Tokenizer for Tokenizer { - type TokenStream<'a> = BoxTokenStream<'a>; - - fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { - match self { - Tokenizer::Normal(tokenizer) => tokenizer.token_stream(text), - Tokenizer::Stemmed(tokenizer) => tokenizer.token_stream(text), - Tokenizer::Identity(tokenizer) => tokenizer.token_stream(text), - Tokenizer::Json(tokenizer) => tokenizer.token_stream(text), - Tokenizer::Bigram(tokenizer) => tokenizer.token_stream(text), - Tokenizer::Trigram(tokenizer) => tokenizer.token_stream(text), - Tokenizer::Url(tokenizer) => tokenizer.token_stream(text), - } - } -} - -impl tantivy::tokenizer::Tokenizer for Normal { - type TokenStream<'a> = BoxTokenStream<'a>; - - fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { - let builder = TextAnalyzer::builder(Simple).filter(LowerCaser); - - self.analyzer = if let Some(stopwords) = &self.stopwords { - Some( - builder - .filter(StopWordFilter::remove(stopwords.clone())) - .build(), - ) - } else { - Some(builder.build()) - }; - - self.analyzer.as_mut().unwrap().token_stream(text) - } -} - -impl tantivy::tokenizer::Tokenizer for Stemmed { - type TokenStream<'a> = BoxTokenStream<'a>; - - fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { - let builder = TextAnalyzer::builder(Simple).filter(LowerCaser); - - let lang = match self.force_language { - Some(lang) => Some(lang), - None => whatlang::detect_lang(text), - }; - - self.analyzer = match lang { - Some(lang) => Some(builder.filter(MyStemmer::from(lang).0).build()), - None => Some(builder.build()), - }; - - self.analyzer.as_mut().unwrap().token_stream(text) - } -} - -impl tantivy::tokenizer::Tokenizer for Identity { - type TokenStream<'a> = BoxTokenStream<'a>; - - fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> { - BoxTokenStream::new(IdentityTokenStream::from(text.to_string())) - } -} - -impl tantivy::tokenizer::Tokenizer for BigramTokenizer { - type TokenStream<'a> = BoxTokenStream<'a>; - - fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { - let inner_stream = self.inner_tokenizer.token_stream(text); - let stream: NGramTokenStream<2> = NGramTokenStream::new(inner_stream); - BoxTokenStream::new(stream) - } -} - -impl tantivy::tokenizer::Tokenizer for TrigramTokenizer { - type TokenStream<'a> = BoxTokenStream<'a>; - - fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { - let inner = self.inner_tokenizer.token_stream(text); - let stream: NGramTokenStream<3> = NGramTokenStream::new(inner); - BoxTokenStream::new(stream) - } -} - -pub struct IdentityTokenStream { - num_advances: usize, - token: Option, -} - -impl From for IdentityTokenStream { - fn from(text: String) -> Self { - Self { - num_advances: 0, - token: Some(tantivy::tokenizer::Token { - offset_from: 0, - offset_to: text.len(), - position: 0, - text, - ..Default::default() - }), - } - } -} - -impl tantivy::tokenizer::TokenStream for IdentityTokenStream { - fn advance(&mut self) -> bool { - self.num_advances += 1; - - if self.num_advances == 1 { - true - } else { - self.token = None; - false - } - } - - fn token(&self) -> &tantivy::tokenizer::Token { - self.token.as_ref().unwrap() - } - - fn token_mut(&mut self) -> &mut 
tantivy::tokenizer::Token { - self.token.as_mut().unwrap() - } -} - -#[derive(Logos, Debug, PartialEq)] -#[logos(skip r"[ \t\n\f]+")] -enum Token { - #[regex("[\\w|\\p{Han}|\\p{Hiragana}|\\p{Katakana}|\\p{Cyrillic}|\\p{Arabic}]+")] - Text, -} - -#[derive(Clone)] -pub struct Simple; - -pub struct SimpleTokenStream<'a> { - lexer: Lexer<'a, Token>, - token: Option, - next_position: usize, -} - -impl tantivy::tokenizer::Tokenizer for Simple { - type TokenStream<'a> = BoxTokenStream<'a>; - - fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> { - let lexer = Token::lexer(text); - BoxTokenStream::new(SimpleTokenStream { - lexer, - token: None, - next_position: 0, - }) - } -} - -impl<'a> tantivy::tokenizer::TokenStream for SimpleTokenStream<'a> { - fn advance(&mut self) -> bool { - self.token = self.lexer.next().map(|_| { - let span = self.lexer.span(); - let pos = self.next_position; - self.next_position += 1; - tantivy::tokenizer::Token { - offset_from: span.start, - offset_to: span.end, - position: pos, - text: self.lexer.slice().to_string(), - ..Default::default() - } - }); - - self.token.is_some() - } - - fn token(&self) -> &tantivy::tokenizer::Token { - self.token.as_ref().unwrap() - } - - fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { - self.token.as_mut().unwrap() - } -} - -pub struct NGramTokenStream<'a, const N: usize> { - inner: BoxTokenStream<'a>, - token: tantivy::tokenizer::Token, - token_window: [tantivy::tokenizer::Token; N], - next_pos: usize, -} - -impl<'a, const N: usize> NGramTokenStream<'a, N> { - pub fn new(inner: BoxTokenStream<'a>) -> Self { - Self { - inner, - token: tantivy::tokenizer::Token::default(), - token_window: array::from_fn(|_| tantivy::tokenizer::Token::default()), - next_pos: 0, - } - } -} - -fn reuse_token_alloc(token: &mut tantivy::tokenizer::Token, new_token: &tantivy::tokenizer::Token) { - token.text.clear(); - token.text += new_token.text.as_str(); - token.offset_from = new_token.offset_from; - token.offset_to = new_token.offset_to; - token.position = new_token.position; - token.position_length = new_token.position_length; -} - -impl<'a, const N: usize> tantivy::tokenizer::TokenStream for NGramTokenStream<'a, N> { - fn advance(&mut self) -> bool { - if !self.inner.advance() { - return false; - } - - self.token_window.rotate_left(1); - reuse_token_alloc(&mut self.token_window[N - 1], self.inner.token()); +pub use fields::FieldTokenizer; - while self.token_window[0].text.is_empty() { - if !self.inner.advance() { - return false; - } - - self.token_window.rotate_left(1); - reuse_token_alloc(&mut self.token_window[N - 1], self.inner.token()); - } - - self.next_pos += 1; - - let begin = self - .token_window - .iter() - .position(|token| !token.text.is_empty()) - .unwrap_or(N - 1); - - self.token.position = self.next_pos; - self.token.offset_from = self.token_window[begin].offset_from; - self.token.offset_to = self.token_window[N - 1].offset_to; - self.token.position_length = N - begin; - - self.token.text.clear(); - for token in &self.token_window { - self.token.text += token.text.as_str(); - } - - true - } - - fn token(&self) -> &tantivy::tokenizer::Token { - &self.token - } - - fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { - &mut self.token - } -} - -pub struct FlattenedJson { - flattened_json: String, - inner_tokenizer: JsonField, -} - -struct IntermediateFlatValue { - parent_keys: Vec, - val: serde_json::Value, -} - -fn flatten(val: serde_json::Value) -> Vec { - let mut res = Vec::new(); - - let mut stack = 
Vec::new(); - stack.push(IntermediateFlatValue { - parent_keys: Vec::new(), - val, - }); - - while let Some(elem) = stack.pop() { - match elem.val { - serde_json::Value::Null => { - res.push(itertools::intersperse(elem.parent_keys, ".".to_string()).collect()) - } - serde_json::Value::Bool(b) => { - let key: String = - itertools::intersperse(elem.parent_keys, ".".to_string()).collect(); - res.push(format!("{key}=\"{b}\"")) - } - serde_json::Value::Number(n) => { - let key: String = - itertools::intersperse(elem.parent_keys, ".".to_string()).collect(); - res.push(format!("{key}=\"{n}\"")) - } - serde_json::Value::String(s) => { - let key: String = - itertools::intersperse(elem.parent_keys, ".".to_string()).collect(); - res.push(format!("{key}=\"{}\"", s.replace('"', "\\\""))) - } - serde_json::Value::Array(arr) => { - for item in arr { - stack.push(IntermediateFlatValue { - parent_keys: elem.parent_keys.clone(), - val: item, - }); - } - } - serde_json::Value::Object(map) => { - for (key, val) in map { - let mut parent_keys = elem.parent_keys.clone(); - parent_keys.push(key); +use self::segmenter::Segmenter; - stack.push(IntermediateFlatValue { parent_keys, val }); - } - } - } - } - - res.reverse(); - - res +#[derive(Debug)] +pub struct Token { + text: String, + span: std::ops::Range, } -impl FlattenedJson { - pub fn new(value: &T) -> crate::Result - where - T: serde::Serialize, - { - let json = serde_json::to_string(value)?; - let val: serde_json::Value = serde_json::from_str(&json)?; - - let flattened_json = itertools::intersperse(flatten(val), "\n".to_string()).collect(); - - Ok(Self { - flattened_json, - inner_tokenizer: JsonField, - }) - } - - pub fn token_stream(&mut self) -> BoxTokenStream { - tantivy::tokenizer::Tokenizer::token_stream(&mut self.inner_tokenizer, &self.flattened_json) +impl Token { + pub fn new(text: String, span: std::ops::Range) -> Self { + Token { text, span } } pub fn text(&self) -> &str { - &self.flattened_json - } -} - -#[derive(Clone, Debug)] -pub struct JsonField; - -impl JsonField { - pub fn as_str() -> &'static str { - "json_tokenizer" - } -} - -impl tantivy::tokenizer::Tokenizer for JsonField { - type TokenStream<'a> = BoxTokenStream<'a>; - - fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> { - BoxTokenStream::new(JsonFieldTokenStream { - text, - chars: text.char_indices(), - token: tantivy::tokenizer::Token::default(), - }) + &self.text } -} - -pub struct JsonFieldTokenStream<'a> { - text: &'a str, - chars: CharIndices<'a>, - token: tantivy::tokenizer::Token, -} - -impl<'a> JsonFieldTokenStream<'a> { - // search for the end of the current token. - fn search_token_end(&mut self, is_quote: bool) -> usize { - let mut escaped = false; - for (offset, c) in self.chars.by_ref() { - if is_quote { - if c == '\\' { - escaped = true; - } else { - if c == '"' && !escaped { - return offset; - } - - escaped = false; - } - } else if !c.is_alphanumeric() { - return offset; - } - } - - self.text.len() - } -} - -impl<'a> tantivy::tokenizer::TokenStream for JsonFieldTokenStream<'a> { - fn advance(&mut self) -> bool { - self.token.text.clear(); - self.token.position = self.token.position.wrapping_add(1); - let mut prev_was_quote = false; - - while let Some((offset_from, c)) = self.chars.next() { - if !matches!(c, '.' 
| '\n' | '"') { - let offset_to = self.search_token_end(prev_was_quote); - self.token.offset_from = offset_from; - self.token.offset_to = offset_to; - - if prev_was_quote { - self.token.offset_from -= 1; - self.token.offset_to += 1; - - self.token.offset_from = floor_char_boundary(self.text, self.token.offset_from); - self.token.offset_to = - ceil_char_boundary(self.text, self.token.offset_to).min(self.text.len()); - } - - self.token - .text - .push_str(&self.text[self.token.offset_from..self.token.offset_to]); - return true; - } - - prev_was_quote = c == '"'; - } - false - } - - fn token(&self) -> &tantivy::tokenizer::Token { - &self.token - } - - fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { - &mut self.token - } -} - -#[derive(Clone, Default)] -struct ParsedUrl { - protocol: Option>, - domain: Option>, - path: VecDeque, -} - -#[derive(Debug, Clone)] -pub struct UrlTokenizer; -impl UrlTokenizer { - pub fn as_str() -> &'static str { - "url_tokenizer" + pub fn span(&self) -> std::ops::Range { + self.span.clone() } - fn parse_url(text: &str) -> ParsedUrl { - url::Url::parse(text) - .or_else(|_| url::Url::parse(&format!("http://{}", text))) - .map(|url| { - let domain = Some( - url.host_str() - .unwrap_or("") - .split_preserve(|c| matches!(c, '.')) - .filter(|s| !(*s).is_empty()) - .map(|s| s.to_string()) - .add_space_last() - .collect(), - ); - let path: VecDeque<_> = url - .path() - .split_preserve(|c| matches!(c, '/' | '-' | '_')) - .filter(|s| !(*s).is_empty()) - .map(|s| s.to_string()) - .collect(); - - if matches!(url.scheme(), "http" | "https") { - ParsedUrl { - protocol: None, - domain, - path, - } - } else { - let mut v = VecDeque::new(); - v.push_back(url.scheme().to_string()); - - ParsedUrl { - protocol: Some(v), - domain, - path, - } - } - }) - .unwrap_or_default() - } -} - -impl tantivy::tokenizer::Tokenizer for UrlTokenizer { - type TokenStream<'a> = BoxTokenStream<'a>; - - fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> { - debug_assert_eq!(text.chars().filter(|c| *c == ' ').count(), 0); - - let urls = text - .split('\n') - .filter(|s| !s.is_empty()) - .map(|s| s.to_lowercase()) - .map(|s| Self::parse_url(&s)) - .collect(); - - BoxTokenStream::new(SiteOperatorUrlTokenStream::new(urls)) + pub fn offset(&mut self, offset: usize) { + self.span = self.span.start + offset..self.span.end + offset; } } -pub struct SiteOperatorUrlTokenStream { - urls: VecDeque, - current_url: ParsedUrl, - token: tantivy::tokenizer::Token, -} - -impl SiteOperatorUrlTokenStream { - fn new(mut urls: VecDeque) -> Self { - let current_url = urls.pop_front().unwrap_or_default(); - - Self { - urls, - current_url, - token: tantivy::tokenizer::Token::default(), - } - } - - fn advance_current_url(&mut self) -> bool { - if let Some(protocol) = self.current_url.protocol.as_mut() { - self.token.position = self.token.position.wrapping_add(1); - self.token.text.clear(); - - if let Some(s) = protocol.pop_front() { - self.token.text.push_str(&s); - self.token.offset_from = 0; - self.token.offset_to = s.len(); - } else { - self.token.offset_from = self.token.offset_to; - self.token.text.push_str("://"); - self.token.offset_to += self.token.text.len(); - - self.current_url.protocol = None; - } - - return true; - } - - if let Some(domain) = self.current_url.domain.as_mut() { - if let Some(s) = domain.pop_front() { - self.token.text.clear(); - self.token.position = self.token.position.wrapping_add(1); - - self.token.text.push_str(&s); - - self.token.offset_from = self.token.offset_to; 
- self.token.offset_to += self.token.text.len(); - return true; - } - } - - if let Some(s) = self.current_url.path.pop_front() { - self.token.text.clear(); - self.token.position = self.token.position.wrapping_add(1); - - self.token.text.push_str(&s); - self.token.offset_from = self.token.offset_to; - self.token.offset_to += self.token.text.len(); - - return true; - } - - false - } - - fn next_url(&mut self) -> bool { - if let Some(url) = self.urls.pop_front() { - self.current_url = url; - - self.token.position = self.token.position.wrapping_add(1); - self.token.text.clear(); - self.token.text.push('\n'); - - self.token.offset_from = self.token.offset_to; - self.token.offset_to += self.token.text.len(); - - true - } else { - false - } - } +pub trait Tokenize { + fn tokenize(&self) -> impl Iterator + '_; } -impl tantivy::tokenizer::TokenStream for SiteOperatorUrlTokenStream { - fn advance(&mut self) -> bool { - if self.advance_current_url() { - return true; - } - - self.next_url() - } - - fn token(&self) -> &tantivy::tokenizer::Token { - &self.token - } - - fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { - &mut self.token +impl Tokenize for str { + fn tokenize(&self) -> impl Iterator + '_ { + self.segments().flat_map(|segment| segment.tokenize()) } } #[cfg(test)] mod tests { - use tantivy::tokenizer::Tokenizer as _; - use super::*; - - fn tokenize_simple(s: &str) -> Vec { - let mut res = Vec::new(); - let mut tokenizer = Normal::default(); - let mut stream = tokenizer.token_stream(s); - - while let Some(token) = stream.next() { - res.push(token.text.clone()); - } - - res - } - - fn tokenize_json(s: &str) -> Vec { - let mut res = Vec::new(); - let mut tokenizer = JsonField; - let mut stream = tokenizer.token_stream(s); - - while let Some(token) = stream.next() { - res.push(token.text.clone()); - } - - res - } - - fn tokenize_bigram(s: &str) -> Vec { - let mut res = Vec::new(); - let mut tokenizer = Tokenizer::Bigram(BigramTokenizer::default()); - let mut stream = tokenizer.token_stream(s); - - while let Some(token) = stream.next() { - res.push(token.text.clone()); - } - - res - } - - fn tokenize_trigram(s: &str) -> Vec { - let mut res = Vec::new(); - - let mut tokenizer = Tokenizer::Trigram(TrigramTokenizer::default()); - let mut stream = tokenizer.token_stream(s); - - while let Some(token) = stream.next() { - res.push(token.text.clone()); - } - - res - } - - fn tokenize_url(s: &str) -> Vec { - let mut res = Vec::new(); - let mut tokenizer = UrlTokenizer; - let mut stream = tokenizer.token_stream(s); - - while let Some(token) = stream.next() { - res.push(token.text.clone()); - } - - res - } - - fn tokenize_identity(s: &str) -> Vec { - let mut res = Vec::new(); - let mut tokenizer = Identity {}; - let mut stream = tokenizer.token_stream(s); - - while let Some(token) = stream.next() { - res.push(token.text.clone()); - } - - res - } - - #[test] - fn simple_tokenization() { - assert_eq!( - tokenize_simple("this is a relatively simple123 test string"), - vec![ - "this", - "is", - "a", - "relatively", - "simple123", - "test", - "string" - ] - ); - } - - #[test] - fn out_of_bounds_crash() { - tokenize_json( - r#" -Breadcrumb.title="Home" -Breadcrumb.url="https://www.eurotecnicaservice.it/?lang=en" -Breadcrumb.title="Fuser Pur" -Breadcrumb.url="https://www.eurotecnicaservice.it/testing\" -"#, - ); - } - - #[test] - fn special_character_tokenization() { - assert_eq!(tokenize_simple("example.com"), vec!["example", ".", "com",]); - assert_eq!( - tokenize_simple("example. 
com"), - vec!["example", ".", "com",] - ); - assert_eq!( - tokenize_simple("example . com"), - vec!["example", ".", "com",] - ); - - assert_eq!( - tokenize_simple("a c++ blog post"), - vec!["a", "c", "+", "+", "blog", "post"] - ); - assert_eq!(tokenize_simple("path/test"), vec!["path", "/", "test",]); - } + use proptest::prelude::*; #[test] - fn tokenize_json_field() { - assert_eq!( - tokenize_json(r#"Test.field="value""#), - vec!["Test", "field", "\"value\"",] - ); - assert_eq!( - tokenize_json(r#"Test.field="this is the value""#), - vec!["Test", "field", "\"this is the value\"",] - ); - assert_eq!( - tokenize_json(r#"Test.field="this is\" the value""#), - vec!["Test", "field", "\"this is\\\" the value\"",] - ); - assert_eq!( - tokenize_json("Test.field=\"this*@# is\\\" the\\\" \nvalue\""), - vec!["Test", "field", "\"this*@# is\\\" the\\\" \nvalue\"",] - ); - } - - fn flattened_json_helper(json: &str, expected: &str) { - let parsed: serde_json::Value = serde_json::from_str(json).unwrap(); - let flat = &FlattenedJson::new(&parsed).unwrap().flattened_json; - - assert_eq!(flat, expected); - } - - #[test] - fn flatten_json_object() { - let json = r#" - { - "key1": "val1", - "key2": "val2" - } - "#; - let expected = r#"key1="val1" -key2="val2""#; - - flattened_json_helper(json, expected); - - let json = r#" - { - "key1": 1, - "key2": 2 - } - "#; - let expected = r#"key1="1" -key2="2""#; - - flattened_json_helper(json, expected); - - let json = r#" - { - "key1": { - "key2": "value1", - "key3": "value2" - } - } - "#; - let expected = r#"key1.key2="value1" -key1.key3="value2""#; - - flattened_json_helper(json, expected); - - let json = r#" - { - "$key1": { - "$key2": "value1", - "key3": "value2" + fn test_tokenizer() { + let input = "Hello, world! This is a test."; + + let tokens: Vec<_> = input.tokenize().collect(); + assert_eq!(tokens.len(), 9); + + assert_eq!(tokens[0].text(), "Hello"); + assert_eq!(tokens[1].text(), ","); + assert_eq!(tokens[2].text(), "world"); + assert_eq!(tokens[3].text(), "!"); + assert_eq!(tokens[4].text(), "This"); + assert_eq!(tokens[5].text(), "is"); + assert_eq!(tokens[6].text(), "a"); + assert_eq!(tokens[7].text(), "test"); + assert_eq!(tokens[8].text(), "."); + } + + proptest! { + #[test] + fn prop_tokenizer_correct_span(txt: String) { + let tokens: Vec<_> = txt.tokenize().collect(); + for token in tokens { + assert_eq!(&txt[token.span()], token.text()); } } - "#; - let expected = r#"$key1.$key2="value1" -$key1.key3="value2""#; - - flattened_json_helper(json, expected); - - let json = r#" - { - "key1": [ - "value1", - "value2" - ] - } - "#; - let expected = r#"key1="value1" -key1="value2""#; - - flattened_json_helper(json, expected); - - let json = r#" - { - "key1": [ - "value1", - { - "key2": "value2", - "key3": 123 - } - ] - } - "#; - let expected = r#"key1="value1" -key1.key2="value2" -key1.key3="123""#; - - flattened_json_helper(json, expected); - - let json = r#" - { - "key1": [ - "value1", - { - "key2": "this\" is @ a # test" - } - ] - } - "#; - let expected = r#"key1="value1" -key1.key2="this\" is @ a # test""#; - - flattened_json_helper(json, expected); - } - - #[test] - fn bigram_tokenizer() { - assert!(tokenize_bigram("").is_empty()); - assert!(tokenize_bigram("test").is_empty()); - - assert_eq!(tokenize_bigram("this is"), vec!["thisis"]); - assert_eq!(tokenize_bigram("this is a"), vec!["thisis", "isa",]); - assert_eq!( - tokenize_bigram("this is a test"), - vec!["thisis", "isa", "atest",] - ); - - // '.' 
is a stopword - assert_eq!(tokenize_bigram("this.is"), vec!["thisis"]); - } - - #[test] - fn trigram_tokenizer() { - assert!(tokenize_trigram("").is_empty()); - assert!(tokenize_trigram("test").is_empty()); - assert!(tokenize_trigram("this is").is_empty()); - - assert_eq!(tokenize_trigram("this is a"), vec!["thisisa",]); - assert_eq!( - tokenize_trigram("this is a test"), - vec!["thisisa", "isatest",] - ); - } - - #[test] - fn han() { - assert_eq!( - tokenize_simple("test 漢.com"), - vec!["test", "漢", ".", "com"] - ); - } - - #[test] - fn hiragana() { - assert_eq!( - tokenize_simple("test あ.com"), - vec!["test", "あ", ".", "com"] - ); - } - - #[test] - fn katakana() { - assert_eq!( - tokenize_simple("test ダ.com"), - vec!["test", "ダ", ".", "com"] - ); - } - - #[test] - fn cyrillic() { - assert_eq!(tokenize_simple("test б.com"), vec!["test", "б", ".", "com"]); - } - - #[test] - fn arabic() { - assert_eq!(tokenize_simple("test ب.com"), vec!["test", "ب", ".", "com"]); - } - - #[test] - fn url() { - assert_eq!( - tokenize_url("https://www.example.com"), - vec!["www", ".", "example", ".", "com ",] - ); - - assert_eq!( - tokenize_url("https://www.example.com/test"), - vec!["www", ".", "example", ".", "com ", "/", "test",] - ); - - assert_eq!(tokenize_url("example.com"), vec!["example", ".", "com ",]); - - assert_eq!( - tokenize_url("example.com/another/path"), - vec!["example", ".", "com ", "/", "another", "/", "path",] - ); - - assert_eq!(tokenize_url(".com"), vec![".", "com ",]) - } - - #[test] - fn multiple_urls() { - assert_eq!( - tokenize_url("https://www.example.com\nhttps://www.example.com"), - vec!["www", ".", "example", ".", "com ", "\n", "www", ".", "example", ".", "com ",] - ); - - assert_eq!( - tokenize_url("https://www.example.com/test\nhttps://www.abcd.com"), - vec![ - "www", ".", "example", ".", "com ", "/", "test", "\n", "www", ".", "abcd", ".", - "com ", - ] - ); - - assert_eq!( - tokenize_url("https://example.com/test\nhttps://www.abcd.com/test"), - vec![ - "example", ".", "com ", "/", "test", "\n", "www", ".", "abcd", ".", "com ", "/", - "test", - ] - ); - } - - #[test] - fn identity() { - assert_eq!(tokenize_identity("this is a test"), vec!["this is a test"]); - assert_eq!(tokenize_identity("a-b"), vec!["a-b"]); } } diff --git a/crates/core/src/tokenizer/script.rs b/crates/core/src/tokenizer/script.rs new file mode 100644 index 00000000..60f762b4 --- /dev/null +++ b/crates/core/src/tokenizer/script.rs @@ -0,0 +1,44 @@ +// Stract is an open source web search engine. +// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+
+use super::script_tokenizer::ScriptTokenizer;
+
+#[derive(Debug, PartialEq, Default, Clone, Copy)]
+pub enum Script {
+    Latin,
+
+    #[default]
+    Other,
+}
+
+impl From<char> for Script {
+    fn from(c: char) -> Self {
+        if c.is_ascii() {
+            Script::Latin
+        } else {
+            Script::Other
+        }
+    }
+}
+
+impl Script {
+    pub fn tokenizer(self) -> Box<dyn ScriptTokenizer> {
+        match self {
+            Script::Latin => Box::new(super::script_tokenizer::Latin),
+            Script::Other => Box::new(super::script_tokenizer::Latin),
+        }
+    }
+}
diff --git a/crates/core/src/tokenizer/script_tokenizer.rs b/crates/core/src/tokenizer/script_tokenizer.rs
new file mode 100644
index 00000000..00e2f756
--- /dev/null
+++ b/crates/core/src/tokenizer/script_tokenizer.rs
@@ -0,0 +1,73 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2024 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use super::{
+    split_preserve::StrSplitPreserveWithRange,
+    split_whitespace_with_range::SplitWhitespaceWithRange, Token,
+};
+
+pub trait ScriptTokenizer {
+    fn tokenize<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item = Token> + 'a>;
+}
+
+pub struct Latin;
+
+impl ScriptTokenizer for Latin {
+    fn tokenize<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item = Token> + 'a> {
+        Box::new(
+            text.split_whitespace_with_range()
+                .flat_map(|(txt, span)| {
+                    let offset = span.start;
+                    txt.split_preserve_with_range(|c| !c.is_alphabetic() && !c.is_numeric())
+                        .map(move |(txt, span)| {
+                            let span = offset + span.start..offset + span.end;
+                            (txt, span)
+                        })
+                })
+                .map(|(txt, span)| Token::new(txt.to_string(), span)),
+        )
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use proptest::prelude::*;
+
+    #[test]
+    fn test_latin() {
+        let tokenizer = Latin;
+        let txt = "Hello, world! 123";
+        let tokens: Vec<_> = tokenizer.tokenize(txt).collect();
+        assert_eq!(tokens.len(), 5);
+        assert_eq!(tokens[0].text(), "Hello");
+        assert_eq!(tokens[1].text(), ",");
+        assert_eq!(tokens[2].text(), "world");
+        assert_eq!(tokens[3].text(), "!");
+        assert_eq!(tokens[4].text(), "123");
+    }
+
+    proptest! {
+        #[test]
+        fn prop_latin_correct_span(txt: String) {
+            let tokenizer = Latin;
+            let tokens: Vec<_> = tokenizer.tokenize(&txt).collect();
+            for token in tokens {
+                assert_eq!(&txt[token.span()], token.text());
+            }
+        }
+    }
+}
diff --git a/crates/core/src/tokenizer/segmenter.rs b/crates/core/src/tokenizer/segmenter.rs
new file mode 100644
index 00000000..c7a91eb3
--- /dev/null
+++ b/crates/core/src/tokenizer/segmenter.rs
@@ -0,0 +1,156 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2024 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use super::{script::Script, Token};
+
+/// A segment is a part of a text where the entire segment has the same script and language.
+#[derive(Clone)]
+pub struct Segment<'a> {
+    full_text: &'a str,
+    span: std::ops::Range<usize>,
+    script: Script,
+}
+
+impl<'a> Segment<'a> {
+    pub fn script(&self) -> Script {
+        self.script
+    }
+
+    pub fn text(&self) -> &'a str {
+        &self.full_text[self.span.clone()]
+    }
+
+    pub fn span(&self) -> std::ops::Range<usize> {
+        self.span.clone()
+    }
+
+    pub fn tokenize(&self) -> impl Iterator<Item = Token> + 'a {
+        let offset = self.span.start;
+        let script = self.script;
+
+        script
+            .tokenizer()
+            .tokenize(self.text())
+            .map(move |mut token| {
+                token.offset(offset);
+                token
+            })
+    }
+}
+
+pub trait Segmenter {
+    fn segments(&self) -> SegmentIterator;
+}
+
+impl Segmenter for str {
+    fn segments(&self) -> SegmentIterator<'_> {
+        SegmentIterator::new(self)
+    }
+}
+
+impl Segmenter for String {
+    fn segments(&self) -> SegmentIterator<'_> {
+        SegmentIterator::new(self)
+    }
+}
+
+pub struct SegmentIterator<'a> {
+    prev_end: usize,
+    input: &'a str,
+}
+
+impl<'a> SegmentIterator<'a> {
+    pub fn new(input: &'a str) -> Self {
+        Self { input, prev_end: 0 }
+    }
+}
+
+impl<'a> Iterator for SegmentIterator<'a> {
+    type Item = Segment<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.prev_end >= self.input.len() {
+            return None;
+        }
+
+        let start = self.prev_end;
+        let mut end = start;
+        let mut script = None;
+
+        while end < self.input.len() {
+            let c = self.input[end..].chars().next().unwrap();
+            let next_script = Script::from(c);
+
+            if let Some(script) = &script {
+                if &next_script != script {
+                    break;
+                }
+            } else {
+                script = Some(next_script);
+            }
+
+            end += c.len_utf8();
+        }
+
+        self.prev_end = end;
+
+        Some(Segment {
+            script: script.unwrap_or_default(),
+            full_text: self.input,
+            span: start..end,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use proptest::prelude::*;
+
+    #[test]
+    fn test_segments() {
+        let txt = "Hello, world! This is a test.";
+        let segments: Vec<_> = txt.segments().collect();
+        assert_eq!(segments.len(), 1);
+        assert_eq!(segments[0].text(), txt);
+        assert_eq!(segments[0].script, Script::Latin);
+
+        let txt = "こんにちは、世界!";
+        let segments: Vec<_> = txt.segments().collect();
+
+        assert_eq!(segments.len(), 1);
+        assert_eq!(segments[0].text(), txt);
+        assert_eq!(segments[0].script, Script::Other);
+
+        let txt = "Hello, こんにちは、世界!";
+        let segments: Vec<_> = txt.segments().collect();
+
+        assert_eq!(segments.len(), 2);
+        assert_eq!(segments[0].text(), "Hello, ");
+        assert_eq!(segments[0].script, Script::Latin);
+        assert_eq!(segments[1].text(), "こんにちは、世界!");
+        assert_eq!(segments[1].script, Script::Other);
+    }
+
+    proptest!
{ + #[test] + fn proptest_byte_offsets(txt in ".*") { + for segment in txt.segments() { + assert!(!segment.text().is_empty()); + } + } + } +} diff --git a/crates/core/src/tokenizer/split_preserve.rs b/crates/core/src/tokenizer/split_preserve.rs index f0bb4afc..e6506610 100644 --- a/crates/core/src/tokenizer/split_preserve.rs +++ b/crates/core/src/tokenizer/split_preserve.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -pub struct SplitPreserve<'a, P> +pub struct SplitPreserveWithRange<'a, P> where P: Fn(char) -> bool, { @@ -24,7 +24,7 @@ where last_pred: Option, } -impl<'a, P> SplitPreserve<'a, P> +impl<'a, P> SplitPreserveWithRange<'a, P> where P: Fn(char) -> bool, { @@ -38,45 +38,81 @@ where } } -impl<'a, P> Iterator for SplitPreserve<'a, P> +impl<'a, P> Iterator for SplitPreserveWithRange<'a, P> where P: Fn(char) -> bool, { - type Item = &'a str; + type Item = (&'a str, std::ops::Range); fn next(&mut self) -> Option { - if self.start >= self.s.len() { - return None; + if let Some(c) = self.last_pred.take() { + let range = self.start - c.len_utf8()..self.start; + return Some((&self.s[range.clone()], range)); } - if let Some(c) = self.last_pred.take() { - return Some(&self.s[self.start - c.len_utf8()..self.start]); + if self.start >= self.s.len() { + return None; } for (i, c) in self.s[self.start..].char_indices() { if (self.pred)(c) { - self.last_pred = Some(c); + let range = self.start..self.start + i; + let res = &self.s[range.clone()]; + self.start += i + c.len_utf8(); - let res = &self.s[self.start..self.start + i]; + if i == 0 { + let range = self.start - c.len_utf8()..self.start; + return Some((&self.s[range.clone()], range)); + } - self.start += i + c.len_utf8(); + self.last_pred = Some(c); - return Some(res); + return Some((res, range)); } } if self.start < self.s.len() { - let res = &self.s[self.start..]; + let range = self.start..self.s.len(); + let res = &self.s[range.clone()]; self.start = self.s.len(); - Some(res) + Some((res, range)) } else { None } } } +pub struct SplitPreserve<'a, P> +where + P: Fn(char) -> bool, +{ + inner: SplitPreserveWithRange<'a, P>, +} + +impl<'a, P> SplitPreserve<'a, P> +where + P: Fn(char) -> bool, +{ + fn new(s: &'a str, pred: P) -> Self { + Self { + inner: SplitPreserveWithRange::new(s, pred), + } + } +} + +impl<'a, P> Iterator for SplitPreserve<'a, P> +where + P: Fn(char) -> bool, +{ + type Item = &'a str; + + fn next(&mut self) -> Option { + self.inner.next().map(|(s, _)| s) + } +} + pub trait StrSplitPreserve { fn split_preserve(&self, pred: F) -> SplitPreserve where @@ -101,6 +137,30 @@ impl StrSplitPreserve for String { } } +pub trait StrSplitPreserveWithRange { + fn split_preserve_with_range(&self, pred: F) -> SplitPreserveWithRange + where + F: Fn(char) -> bool; +} + +impl StrSplitPreserveWithRange for str { + fn split_preserve_with_range(&self, pred: F) -> SplitPreserveWithRange + where + F: Fn(char) -> bool, + { + SplitPreserveWithRange::new(self, pred) + } +} + +impl StrSplitPreserveWithRange for String { + fn split_preserve_with_range(&self, pred: F) -> SplitPreserveWithRange + where + F: Fn(char) -> bool, + { + SplitPreserveWithRange::new(self, pred) + } +} + #[cfg(test)] mod tests { use super::*; @@ -115,4 +175,50 @@ mod tests { let res = "hello".split_preserve(|c| c == '.').collect::>(); assert_eq!(res, vec!["hello"]); } + + #[test] + fn test_starts_with() { + let res = ".hello.brave.new.world" + .split_preserve(|c| c == '.') + .collect::>(); 
+ assert_eq!( + res, + vec![".", "hello", ".", "brave", ".", "new", ".", "world"] + ); + } + + #[test] + fn test_ends_with() { + let res = "hello.brave.new.world." + .split_preserve(|c| c == '.') + .collect::>(); + assert_eq!( + res, + vec!["hello", ".", "brave", ".", "new", ".", "world", "."] + ); + } + + #[test] + fn test_empty() { + let res = "".split_preserve(|c| c == '.').collect::>(); + assert_eq!(res, vec![] as Vec<&str>); + } + + #[test] + fn test_no_split() { + let res = "hello".split_preserve(|c| c == '.').collect::>(); + assert_eq!(res, vec!["hello"]); + } + + #[test] + fn test_single_char() { + let res = ".".split_preserve(|c| c == '.').collect::>(); + assert_eq!(res, vec!["."]); + } + + #[test] + fn test_multi_char() { + let res = "....".split_preserve(|c| c == '.').collect::>(); + assert_eq!(res, vec![".", ".", ".", "."]); + } } diff --git a/crates/core/src/tokenizer/split_whitespace_with_range.rs b/crates/core/src/tokenizer/split_whitespace_with_range.rs new file mode 100644 index 00000000..001aeafe --- /dev/null +++ b/crates/core/src/tokenizer/split_whitespace_with_range.rs @@ -0,0 +1,112 @@ +// Stract is an open source web search engine. +// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +pub trait SplitWhitespaceWithRange { + fn split_whitespace_with_range(&self) -> SplitWhitespaceWithRangeIter; +} + +pub struct SplitWhitespaceWithRangeIter<'a> { + s: &'a str, + start: usize, +} + +impl<'a> SplitWhitespaceWithRangeIter<'a> { + fn new(s: &'a str) -> Self { + Self { s, start: 0 } + } +} + +impl<'a> Iterator for SplitWhitespaceWithRangeIter<'a> { + type Item = (&'a str, std::ops::Range); + + fn next(&mut self) -> Option { + for c in self.s[self.start..].chars() { + if !c.is_whitespace() { + break; + } + self.start += c.len_utf8(); + } + + if self.start >= self.s.len() { + return None; + } + + let start = self.s[self.start..].find(|c: char| !c.is_whitespace())?; + let start = self.start + start; + let end = self.s[start..] + .find(char::is_whitespace) + .map(|end| start + end) + .unwrap_or(self.s.len()); + let range = start..end; + self.start = end; + Some((&self.s[range.clone()], range)) + } +} + +impl SplitWhitespaceWithRange for str { + fn split_whitespace_with_range(&self) -> SplitWhitespaceWithRangeIter { + SplitWhitespaceWithRangeIter::new(self) + } +} + +impl SplitWhitespaceWithRange for String { + fn split_whitespace_with_range(&self) -> SplitWhitespaceWithRangeIter { + SplitWhitespaceWithRangeIter::new(self) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use proptest::prelude::*; + + #[test] + fn test_split_whitespace_with_range() { + let txt = "Hello, world! 
123"; + let tokens: Vec<_> = txt.split_whitespace_with_range().collect(); + assert_eq!(tokens.len(), 3); + assert_eq!(tokens[0], ("Hello,", 0..6)); + assert_eq!(tokens[1], ("world!", 7..13)); + assert_eq!(tokens[2], ("123", 14..17)); + } + + #[test] + fn test_split_whitespace_with_range_empty() { + let txt = ""; + let tokens: Vec<_> = txt.split_whitespace_with_range().collect(); + assert_eq!(tokens.len(), 0); + } + + #[test] + fn test_multi_whitespace() { + let txt = "Hello, world! 123"; + let tokens: Vec<_> = txt.split_whitespace_with_range().collect(); + assert_eq!(tokens.len(), 3); + assert_eq!(tokens[0], ("Hello,", 0..6)); + assert_eq!(tokens[1], ("world!", 9..15)); + assert_eq!(tokens[2], ("123", 16..19)); + } + + proptest! { + #[test] + fn prop_split_whitespace_with_range(s: String) { + let tokens: Vec<_> = s.split_whitespace_with_range().collect(); + for (txt, range) in tokens { + assert_eq!(&s[range.clone()], txt); + } + } + } +} diff --git a/crates/core/src/tokenizer/stemmer.rs b/crates/core/src/tokenizer/stemmer.rs new file mode 100644 index 00000000..58608625 --- /dev/null +++ b/crates/core/src/tokenizer/stemmer.rs @@ -0,0 +1,80 @@ +// Stract is an open source web search engine. +// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+ +use whatlang::Lang; + +pub struct Stemmer(tantivy::tokenizer::Stemmer); + +impl Stemmer { + pub fn into_tantivy(self) -> tantivy::tokenizer::Stemmer { + self.0 + } +} + +impl From for Stemmer { + fn from(lang: Lang) -> Self { + match lang { + Lang::Dan => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::Danish, + )), + Lang::Ara => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::Arabic, + )), + Lang::Nld => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::Dutch, + )), + Lang::Fin => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::Finnish, + )), + Lang::Fra => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::French, + )), + Lang::Deu => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::German, + )), + Lang::Hun => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::Hungarian, + )), + Lang::Ita => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::Italian, + )), + Lang::Por => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::Portuguese, + )), + Lang::Ron => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::Romanian, + )), + Lang::Rus => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::Russian, + )), + Lang::Spa => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::Spanish, + )), + Lang::Swe => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::Swedish, + )), + Lang::Tam => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::Tamil, + )), + Lang::Tur => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::Turkish, + )), + _ => Stemmer(tantivy::tokenizer::Stemmer::new( + tantivy::tokenizer::Language::English, + )), + } + } +} diff --git a/crates/core/src/warc.rs b/crates/core/src/warc.rs index 9f4f721f..71b09997 100644 --- a/crates/core/src/warc.rs +++ b/crates/core/src/warc.rs @@ -30,7 +30,7 @@ use flate2::write::GzEncoder; use flate2::Compression; use fnv::FnvHashSet; #[cfg(test)] -use proptest_derive::Arbitrary; +use proptest::prelude::*; use tracing::{debug, trace}; @@ -208,15 +208,31 @@ struct RawWarcRecord { } #[derive(Debug)] -#[cfg_attr(test, derive(Clone, Arbitrary, PartialEq))] +#[cfg_attr(test, derive(Clone, PartialEq))] pub struct WarcRecord { pub request: Request, pub response: Response, pub metadata: Metadata, } +#[cfg(test)] +impl Arbitrary for WarcRecord { + type Parameters = (); + type Strategy = BoxedStrategy; + + fn arbitrary_with(_args: ()) -> Self::Strategy { + (any::(), any::(), any::()) + .prop_map(|(request, response, metadata)| Self { + request, + response, + metadata, + }) + .boxed() + } +} + #[derive(Debug)] -#[cfg_attr(test, derive(Clone, Arbitrary, PartialEq))] +#[cfg_attr(test, derive(Clone, PartialEq))] pub struct Request { // WARC-Target-URI pub url: String, @@ -234,8 +250,17 @@ impl Request { } } +#[cfg(test)] +impl Arbitrary for Request { + type Parameters = (); + type Strategy = BoxedStrategy; + + fn arbitrary_with(_args: ()) -> Self::Strategy { + ".+".prop_map(|url| Self { url }).boxed() + } +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] -#[cfg_attr(test, derive(Arbitrary))] pub enum PayloadType { Html, Pdf, @@ -243,6 +268,22 @@ pub enum PayloadType { Atom, } +#[cfg(test)] +impl Arbitrary for PayloadType { + type Parameters = (); + type Strategy = BoxedStrategy; + + fn arbitrary_with(_args: ()) -> 
Self::Strategy { + prop_oneof![ + Just(Self::Html), + Just(Self::Pdf), + Just(Self::Rss), + Just(Self::Atom), + ] + .boxed() + } +} + impl FromStr for PayloadType { type Err = Error; @@ -273,7 +314,7 @@ impl Display for PayloadType { } #[derive(Debug)] -#[cfg_attr(test, derive(Clone, Arbitrary, PartialEq))] +#[cfg_attr(test, derive(Clone, PartialEq))] pub struct Response { pub body: String, pub payload_type: Option, @@ -297,8 +338,20 @@ impl Response { } } +#[cfg(test)] +impl Arbitrary for Response { + type Parameters = (); + type Strategy = BoxedStrategy; + + fn arbitrary_with(_args: ()) -> Self::Strategy { + (".+", any::>()) + .prop_map(|(body, payload_type)| Self { body, payload_type }) + .boxed() + } +} + #[derive(Debug)] -#[cfg_attr(test, derive(Clone, Arbitrary, PartialEq))] +#[cfg_attr(test, derive(Clone, PartialEq))] pub struct Metadata { // fetchTimeMs pub fetch_time_ms: u64, @@ -325,6 +378,18 @@ impl Metadata { } } +#[cfg(test)] +impl Arbitrary for Metadata { + type Parameters = (); + type Strategy = BoxedStrategy; + + fn arbitrary_with(_args: ()) -> Self::Strategy { + (0..10000u64) + .prop_map(|fetch_time_ms| Self { fetch_time_ms }) + .boxed() + } +} + pub struct RecordIterator { reader: BufReader>, num_reads: usize, @@ -671,8 +736,6 @@ mod tests { use super::*; use core::panic; - use proptest::prelude::*; - #[test] fn it_works() { let raw = b"\ diff --git a/crates/core/src/webpage/html/into_tantivy.rs b/crates/core/src/webpage/html/into_tantivy.rs index bcd4a0c9..f289a5af 100644 --- a/crates/core/src/webpage/html/into_tantivy.rs +++ b/crates/core/src/webpage/html/into_tantivy.rs @@ -74,7 +74,7 @@ impl Html { pub fn pretokenize_url_for_site_operator(&self) -> PreTokenizedString { self.pretokenize_string_with( self.url().to_string(), - tokenizer::Tokenizer::Url(tokenizer::UrlTokenizer), + tokenizer::FieldTokenizer::Url(tokenizer::fields::UrlTokenizer), ) } @@ -114,7 +114,7 @@ impl Html { fn pretokenize_string_with( &self, text: String, - tokenizer: tokenizer::Tokenizer, + tokenizer: tokenizer::FieldTokenizer, ) -> PreTokenizedString { let mut tokenizer = tokenizer; diff --git a/crates/core/src/webpage/schema_org/mod.rs b/crates/core/src/webpage/schema_org/mod.rs index 73c6d635..7287300f 100644 --- a/crates/core/src/webpage/schema_org/mod.rs +++ b/crates/core/src/webpage/schema_org/mod.rs @@ -18,7 +18,7 @@ use std::collections::HashMap; use kuchiki::NodeRef; -use crate::tokenizer::FlattenedJson; +use crate::tokenizer::fields::FlattenedJson; use crate::{OneOrMany, Result}; mod json_ld;