Commit c4192af
re-write tokenizer to not use logos anymore
This should fix a reported stack overflow (might be related to maciejhirsz/logos#384) and should also make it easier to add additional scripts besides Latin in the future.
mikkeldenker committed Jul 22, 2024
1 parent 76d7323 commit c4192af
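
The rewritten tokenizer itself lives in files not rendered in this view, so the following is a rough sketch only: given the `unicode-segmentation` dependency this commit adds, the core of a logos-free word tokenizer might look like the code below. This is an illustrative assumption, not the actual `DefaultTokenizer` (which, per the diffs further down, also handles stopwords and per-field variants).

```rust
use unicode_segmentation::UnicodeSegmentation;

// Illustrative sketch only: split on Unicode word boundaries (UAX #29)
// instead of a logos-generated lexer. No per-script regex rules are
// needed, which is what makes adding non-Latin scripts easier later.
fn word_tokens(text: &str) -> impl Iterator<Item = String> + '_ {
    text.unicode_words().map(str::to_lowercase)
}

fn main() {
    let tokens: Vec<String> = word_tokens("Hello, world! Déjà vu.").collect();
    assert_eq!(tokens, ["hello", "world", "déjà", "vu"]);
}
```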
Showing 31 changed files with 2,181 additions and 1,228 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -165,6 +165,7 @@ tower-http = {version = "0.5.0", features = ["compression-gzip", "cors"]}
 tracing = {version = "0.1.34", features = ["release_max_level_info"]}
 tracing-subscriber = {version = "0.3.11", features = ["env-filter"]}
 tracing-test = "0.2.4"
+unicode-segmentation = "1.11.0"
 url = {version = "2.4.0", features = ["serde"]}
 utoipa = {version = "4.2.3", features = ["axum_extras"]}
 utoipa-swagger-ui = {version = "7.0.0", features = ["axum"]}
1 change: 1 addition & 0 deletions crates/core/Cargo.toml
@@ -102,6 +102,7 @@ tower-http.workspace = true
 tower.workspace = true
 tracing-subscriber.workspace = true
 tracing.workspace = true
+unicode-segmentation.workspace = true
 url.workspace = true
 utoipa-swagger-ui.workspace = true
 utoipa.workspace = true
17 changes: 15 additions & 2 deletions crates/core/src/ampc/dht/mod.rs
@@ -146,7 +146,6 @@ pub mod tests {
     use openraft::{error::InitializeError, Config};

     use proptest::prelude::*;
-    use proptest_derive::Arbitrary;

     use futures::{pin_mut, TryStreamExt};
     use rand::seq::SliceRandom;
@@ -557,7 +556,6 @@ pub mod tests {
         bincode::Encode,
         bincode::Decode,
         PartialEq,
-        Arbitrary,
     )]
     enum Action {
         Set { key: String, value: String },
@@ -566,6 +564,21 @@
         Get { prev_key: usize },
     }

+    impl Arbitrary for Action {
+        type Parameters = ();
+        type Strategy = BoxedStrategy<Self>;
+
+        fn arbitrary_with(_args: ()) -> Self::Strategy {
+            prop_oneof![
+                (".{1,10}", ".{1,10}").prop_map(|(key, value)| Action::Set { key, value }),
+                (0..1000).prop_map(|prev_key| Action::Get {
+                    prev_key: prev_key as usize
+                }),
+            ]
+            .boxed()
+        }
+    }
+
     proptest! {
         #![proptest_config(ProptestConfig::with_cases(10))]
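
The hunk above swaps the derived `Arbitrary` for a hand-written strategy, making the generated ranges explicit (keys and values of 1-10 characters, `prev_key` below 1000); the same pattern repeats in the two sonic files below. A self-contained illustration of the pattern, using a hypothetical `Sample` type and illustrative bounds:

```rust
use proptest::prelude::*;

// Hypothetical type; only the impl pattern mirrors the commit.
#[derive(Debug, Clone, PartialEq)]
struct Sample {
    name: String,
    count: usize,
}

impl Arbitrary for Sample {
    type Parameters = ();
    type Strategy = BoxedStrategy<Self>;

    fn arbitrary_with(_args: ()) -> Self::Strategy {
        // A &str regex is itself a proptest strategy for String,
        // and integer ranges are strategies for integers.
        (".{1,10}", 0..1000usize)
            .prop_map(|(name, count)| Sample { name, count })
            .boxed()
    }
}

proptest! {
    #[test]
    fn clone_roundtrip(s: Sample) {
        prop_assert_eq!(s.clone(), s);
    }
}
```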
17 changes: 15 additions & 2 deletions crates/core/src/distributed/sonic/mod.rs
@@ -306,7 +306,6 @@ mod tests {
     use std::{collections::HashMap, future::Future};

     use proptest::prelude::*;
-    use proptest_derive::Arbitrary;

     use crate::free_socket_addr;
@@ -343,12 +342,26 @@
         })
     }

-    #[derive(Debug, Clone, bincode::Encode, bincode::Decode, PartialEq, Arbitrary)]
+    #[derive(Debug, Clone, bincode::Encode, bincode::Decode, PartialEq)]
     struct Message {
         text: String,
         other: HashMap<String, f32>,
     }

+    impl Arbitrary for Message {
+        type Parameters = ();
+        type Strategy = BoxedStrategy<Self>;
+
+        fn arbitrary_with(_args: ()) -> Self::Strategy {
+            (
+                any::<String>(),
+                prop::collection::hash_map(".*", 0.0f32..100.0f32, 0..10),
+            )
+                .prop_map(|(text, other)| Message { text, other })
+                .boxed()
+        }
+    }
+
     proptest! {
         #[test]
         fn basic_arb(a1: Message, b1: Message) {
21 changes: 12 additions & 9 deletions crates/core/src/distributed/sonic/service.rs
@@ -335,9 +335,8 @@ mod tests {
    mod counter_service {
        use std::sync::atomic::AtomicI32;

-       use proptest_derive::Arbitrary;
-
        use super::super::Message;
+       use proptest::prelude::*;

        pub struct CounterService {
            pub counter: AtomicI32,
@@ -346,17 +345,21 @@
        sonic_service!(CounterService, [Change, Reset]);

        #[derive(
-           Debug,
-           Clone,
-           serde::Serialize,
-           serde::Deserialize,
-           bincode::Encode,
-           bincode::Decode,
-           Arbitrary,
+           Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode,
        )]
        pub struct Change {
            pub amount: i32,
        }

+       impl Arbitrary for Change {
+           type Parameters = ();
+           type Strategy = BoxedStrategy<Self>;
+
+           fn arbitrary_with(_args: Self::Parameters) -> Self::Strategy {
+               (0..100).prop_map(|amount| Change { amount }).boxed()
+           }
+       }
+
        #[derive(
            Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode,
        )]
12 changes: 6 additions & 6 deletions crates/core/src/entity_index/mod.rs
@@ -34,7 +34,7 @@ use tantivy::{
 use crate::{
     image_store::{EntityImageStore, Image, ImageStore},
     inverted_index::merge_tantivy_segments,
-    tokenizer::Normal,
+    tokenizer::fields::DefaultTokenizer,
     Result,
 };
@@ -49,7 +49,7 @@ fn schema() -> Schema {
         TextOptions::default()
             .set_indexing_options(
                 TextFieldIndexing::default()
-                    .set_tokenizer(Normal::as_str())
+                    .set_tokenizer(DefaultTokenizer::as_str())
                     .set_index_option(IndexRecordOption::WithFreqsAndPositions),
             )
             .set_stored(),
@@ -59,7 +59,7 @@ fn schema() -> Schema {
         TextOptions::default()
             .set_indexing_options(
                 TextFieldIndexing::default()
-                    .set_tokenizer(Normal::as_str())
+                    .set_tokenizer(DefaultTokenizer::as_str())
                     .set_index_option(IndexRecordOption::WithFreqsAndPositions),
             )
             .set_stored(),
@@ -164,8 +164,8 @@ impl EntityIndex {
             .collect();

         tantivy_index.tokenizers().register(
-            Normal::as_str(),
-            Normal::with_stopwords(stopwords.clone().into_iter().collect()),
+            DefaultTokenizer::as_str(),
+            DefaultTokenizer::with_stopwords(stopwords.clone().into_iter().collect()),
         );

         let image_store = EntityImageStore::open(path.as_ref().join("images"));
@@ -270,7 +270,7 @@ impl EntityIndex {
         let entity_abstract = self.schema.get_field("abstract").unwrap();

         let mut term_queries = Vec::new();
-        let mut tokenizer = Normal::default();
+        let mut tokenizer = DefaultTokenizer::default();
         let mut stream = tokenizer.token_stream(query);
         while let Some(token) = stream.next() {
             if self.stopwords.contains(&token.text) {
6 changes: 3 additions & 3 deletions crates/core/src/feed/index.rs
@@ -21,7 +21,7 @@ use std::{
 use crate::{
     inverted_index::merge_tantivy_segments,
-    tokenizer::{Tokenizer, UrlTokenizer},
+    tokenizer::fields::{FieldTokenizer, UrlTokenizer},
 };
 use anyhow::Result;
 use hashbrown::HashSet;
@@ -45,8 +45,8 @@ pub struct FeedIndex {
 impl FeedIndex {
     pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
-        let url_tokenizer = Tokenizer::Url(UrlTokenizer);
-        let kind_tokenizer = Tokenizer::default();
+        let url_tokenizer = FieldTokenizer::Url(UrlTokenizer);
+        let kind_tokenizer = FieldTokenizer::default();

         let mut builder = tantivy::schema::Schema::builder();
18 changes: 9 additions & 9 deletions crates/core/src/inverted_index/mod.rs
@@ -48,14 +48,14 @@ use crate::ranking::initial::Score;
 use crate::schema::text_field::TextField;
 use crate::schema::{numerical_field, text_field, Field, NumericalFieldEnum, TextFieldEnum};
 use crate::snippet::TextSnippet;
-use crate::tokenizer::{
+use crate::tokenizer::fields::{
     BigramTokenizer, Identity, JsonField, Stemmed, TrigramTokenizer, UrlTokenizer,
 };
 use crate::webpage::region::Region;

 use crate::webpage::schema_org;
 use crate::Result;
-use crate::{schema::create_schema, tokenizer::Tokenizer};
+use crate::{schema::create_schema, tokenizer::FieldTokenizer};
 use std::fs;
 use std::path::Path;
 use std::sync::Arc;
@@ -109,25 +109,25 @@ impl From<DocAddress> for tantivy::DocAddress {
 }

 fn register_tokenizers(manager: &TokenizerManager) {
-    let tokenizer = Tokenizer::default();
+    let tokenizer = FieldTokenizer::default();
     manager.register(tokenizer.as_str(), tokenizer);

-    let tokenizer = Tokenizer::Stemmed(Stemmed::default());
+    let tokenizer = FieldTokenizer::Stemmed(Stemmed::default());
     manager.register(tokenizer.as_str(), tokenizer);

-    let tokenizer = Tokenizer::Identity(Identity::default());
+    let tokenizer = FieldTokenizer::Identity(Identity::default());
     manager.register(tokenizer.as_str(), tokenizer);

-    let tokenizer = Tokenizer::Bigram(BigramTokenizer::default());
+    let tokenizer = FieldTokenizer::Bigram(BigramTokenizer::default());
     manager.register(tokenizer.as_str(), tokenizer);

-    let tokenizer = Tokenizer::Trigram(TrigramTokenizer::default());
+    let tokenizer = FieldTokenizer::Trigram(TrigramTokenizer::default());
     manager.register(tokenizer.as_str(), tokenizer);

-    let tokenizer = Tokenizer::Url(UrlTokenizer);
+    let tokenizer = FieldTokenizer::Url(UrlTokenizer);
     manager.register(tokenizer.as_str(), tokenizer);

-    let tokenizer = Tokenizer::Json(JsonField);
+    let tokenizer = FieldTokenizer::Json(JsonField);
     manager.register(tokenizer.as_str(), tokenizer);
 }
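
For context on the `register_tokenizers` change: tantivy resolves tokenizers by name at index time, so each `FieldTokenizer` variant must be registered under the same string that the schema's fields reference via `set_tokenizer`. A minimal sketch of that coupling, using plain tantivy types and a made-up name rather than stract's actual tokenizers:

```rust
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};

fn main() {
    // The schema stores only the tokenizer's *name*...
    let mut builder = Schema::builder();
    builder.add_text_field(
        "body",
        TextOptions::default().set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer("my_tokenizer")
                .set_index_option(IndexRecordOption::WithFreqsAndPositions),
        ),
    );
    let index = tantivy::Index::create_in_ram(builder.build());

    // ...so an analyzer must be registered under that exact name before
    // documents are indexed, which is what register_tokenizers does above.
    index.tokenizers().register(
        "my_tokenizer",
        TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(LowerCaser)
            .build(),
    );
}
```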

1 comment on commit c4192af

@mikkeldenker (Member, Author)


If any of the logos maintainers are reading: sorry, I didn't mean to create a reference in your issue. I just wanted to add the link here in case it's useful in the future, and didn't realise it would create a reference in your issue as well.
