Skip to content

Commit b1c7cbd

Browse files
authored
More flexible regex matching system (#80)
* Add tests * Tests passing * Simplify parser logic * Add user-defined regex to new regex check pass * Update README and changelog
1 parent e985c0a commit b1c7cbd

File tree

11 files changed

+642
-127
lines changed

11 files changed

+642
-127
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
[0.3.0]
2+
3+
Breaking changes:
4+
5+
- User-defined regex patterns are now matched against a file line by line instead of word by word. This means a pattern should generally not be anchored to the start or end of a line. For example, to match DNA sequences, this pattern used to work: `^[ATCG]+$`. It now needs to be something like `\\b[ATCG]+\\b` (the doubled `\\` is TOML escaping for a single backslash).
6+
7+
- Codebook now ignores text such as URLs and hex color codes by default. See the `User-Defined Regex Patterns` section of the README for more details.
8+
19
[0.2.13]
210

311
- Switch out OpenSSL for rustls

README.md

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -221,11 +221,15 @@ flag_words = ["todo", "fixme"]
221221
ignore_paths = ["target/**/*", "**/*.json", ".git/**/*"]
222222

223223
# List of regex patterns to ignore when spell checking
224+
# Patterns are matched against each line of text, not individual words
224225
# Useful for domain-specific strings or patterns
226+
# Note: Backslashes must be escaped in TOML (use \\ instead of \)
225227
# Default: []
226228
ignore_patterns = [
227-
"^[ATCG]+$", # DNA sequences
228-
"\\d{3}-\\d{2}-\\d{4}" # Social Security Number format
229+
"\\b[ATCG]+\\b", # DNA sequences
230+
"\\d{3}-\\d{2}-\\d{4}", # Social Security Number format
231+
"\\b[A-Z]{2,}\\b", # All-caps words like "HTML", "CSS"
232+
"https?://[^\\s]+" # URLs
229233
]
230234

231235
# Whether to use global configuration (project config only)
@@ -248,6 +252,39 @@ use_global = true
248252
- Project settings are saved automatically when words are added
249253
- Configuration files are automatically reloaded when they change
250254

255+
### User-Defined Regex Patterns
256+
257+
The `ignore_patterns` configuration allows you to define custom regex patterns to skip during spell checking. Here are important details about how they work:
258+
259+
**Default Patterns**: Codebook already includes built-in regex patterns for common technical strings, so you don't need to define these yourself:
260+
- URLs: `https?://[^\\s]+`
261+
- Hex colors: `#[0-9a-fA-F]{3,8}` (like `#deadbeef`, `#fff`)
262+
- Email addresses: `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}`
263+
- File paths: `/[^\\s]*` (Unix) and `[A-Za-z]:\\\\[^\\s]*` (Windows)
264+
- UUIDs: `[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}`
265+
- Base64 strings: `[A-Za-z0-9+/]{20,}={0,2}` (20+ characters)
266+
- Git commit hashes: `\\b[0-9a-fA-F]{7,40}\\b`
267+
- Markdown links: `\\[([^\\]]+)\\]\\(([^)]+)\\)`
268+
269+
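**Line-by-Line Matching**: Regex patterns are applied to each line of text, not to individual words. A pattern anchored with `^` or `$` therefore matches only at the start or end of the whole line, not of a word; use `\\b` word boundaries to match individual words within a line.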
270+
271+
**TOML Escaping**: Since configuration files use TOML format, backslashes in regex patterns must be escaped by doubling them:
272+
- Use `\\b` for word boundaries (not `\b`)
273+
- Use `\\d` for digits (not `\d`)
274+
- Use `\\\\` for literal backslashes (not `\\`)
275+
276+
**Examples**:
277+
```toml
278+
ignore_patterns = [
279+
"\\b[ATCG]+\\b", # DNA sequences with word boundaries
280+
"^\\s*//.*$", # Comment lines starting with //
281+
"https?://[^\\s]+", # URLs (note the escaped \s)
282+
"\\$[a-zA-Z_][a-zA-Z0-9_]*", # Variables starting with $
283+
]
284+
```
285+
286+
**Migration Note**: If you're upgrading from an older version, patterns that used `^` and `$` anchors may need adjustment since matching now occurs line-by-line rather than word-by-word.
287+
251288
## Goals
252289

253290
Spell checking is complicated, and opinions about how it should be done, especially with code, differ. This section is about the trade-offs that steer decisions.

codebook.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@ ignore_paths = [
2222
"**/*.json",
2323
".git/**/*",
2424
]
25-
ignore_patterns = ["^[ATCG]+$"]
25+
ignore_patterns = ["\\b[ATCG]+\\b"]

crates/codebook-config/src/lib.rs

Lines changed: 37 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use crate::settings::ConfigSettings;
33
use glob::Pattern;
44
use log::debug;
55
use log::info;
6-
use regex::RegexSet;
6+
use regex::Regex;
77
use std::env;
88
use std::fs;
99
use std::io;
@@ -30,7 +30,7 @@ pub struct CodebookConfig {
3030
/// Combined settings (global merged with project overrides)
3131
effective_settings: RwLock<ConfigSettings>,
3232
/// Compiled regex patterns for ignoring text
33-
regex_set: RwLock<Option<RegexSet>>,
33+
regex_cache: RwLock<Option<Vec<Regex>>>,
3434
/// Path to the project-specific config file
3535
pub project_config_path: Option<PathBuf>,
3636
project_config_state: RwLock<Option<ConfigFileState>>,
@@ -47,7 +47,7 @@ impl Default for CodebookConfig {
4747
project_settings: RwLock::new(ConfigSettings::default()),
4848
global_settings: RwLock::new(None),
4949
effective_settings: RwLock::new(ConfigSettings::default()),
50-
regex_set: RwLock::new(None),
50+
regex_cache: RwLock::new(None),
5151
project_config_path: None,
5252
project_config_state: RwLock::new(None),
5353
global_config_path: None,
@@ -356,7 +356,7 @@ impl CodebookConfig {
356356
}
357357

358358
// Invalidate regex cache
359-
*self.regex_set.write().unwrap() = None;
359+
*self.regex_cache.write().unwrap() = None;
360360
}
361361

362362
/// Add a word to the project config's allowlist
@@ -493,9 +493,6 @@ impl CodebookConfig {
493493

494494
/// Check if a word is in the effective allowlist
495495
pub fn is_allowed_word(&self, word: &str) -> bool {
496-
if self.matches_ignore_pattern(word) {
497-
return true;
498-
}
499496
let word = word.to_ascii_lowercase();
500497
self.effective_settings
501498
.read()
@@ -516,27 +513,26 @@ impl CodebookConfig {
516513
.any(|w| w == &word)
517514
}
518515

519-
/// Check if text matches any of the ignore patterns
520-
fn matches_ignore_pattern(&self, word: &str) -> bool {
521-
let patterns = &self.effective_settings.read().unwrap().ignore_patterns;
522-
if patterns.is_empty() {
523-
return false;
524-
}
525-
526-
// Lazily initialize the RegexSet
527-
let mut regex_set = self.regex_set.write().unwrap();
528-
if regex_set.is_none() {
529-
match RegexSet::new(patterns) {
530-
Ok(set) => *regex_set = Some(set),
531-
Err(_) => return false,
532-
}
516+
/// Get the list of user-defined ignore patterns
517+
pub fn get_ignore_patterns(&self) -> Option<Vec<Regex>> {
518+
let str_patterns = self
519+
.effective_settings
520+
.read()
521+
.unwrap()
522+
.ignore_patterns
523+
.clone();
524+
525+
// Lazily initialize the Regex cache
526+
let mut regex_cache = self.regex_cache.write().unwrap();
527+
if regex_cache.is_none() {
528+
let regex_set = str_patterns
529+
.into_iter()
530+
.map(|pattern| Regex::new(&pattern).unwrap())
531+
.collect::<Vec<_>>();
532+
*regex_cache = Some(regex_set);
533533
}
534534

535-
// Check if text matches any pattern
536-
if let Some(set) = &*regex_set {
537-
return set.is_match(word);
538-
}
539-
false
535+
regex_cache.clone()
540536
}
541537

542538
/// Clean the cache directory
@@ -702,12 +698,18 @@ mod tests {
702698
file.write_all(a.as_bytes())?;
703699

704700
let config = load_from_file(ConfigType::Project, &config_path)?;
705-
assert!(config.matches_ignore_pattern("GTAC"));
706-
assert!(config.matches_ignore_pattern("AATTCCGG"));
707-
assert!(config.matches_ignore_pattern("123-45-6789"));
708-
assert!(!config.matches_ignore_pattern("Hello"));
709-
assert!(!config.matches_ignore_pattern("GTACZ")); // Invalid DNA sequence
710-
701+
let patterns = config
702+
.effective_settings
703+
.read()
704+
.unwrap()
705+
.ignore_patterns
706+
.clone();
707+
assert!(patterns.contains(&String::from("^[ATCG]+$")));
708+
assert!(patterns.contains(&String::from("\\d{3}-\\d{2}-\\d{4}")));
709+
let reg = config.get_ignore_patterns();
710+
711+
let patterns = reg.as_ref().unwrap();
712+
assert!(patterns.len() == 2);
711713
Ok(())
712714
}
713715

@@ -728,8 +730,7 @@ mod tests {
728730
)?;
729731

730732
let config = load_from_file(ConfigType::Project, &config_path)?;
731-
assert!(config.matches_ignore_pattern("GTAC"));
732-
assert!(!config.matches_ignore_pattern("123-45-6789"));
733+
assert!(config.get_ignore_patterns().unwrap().len() == 1);
733734

734735
// Update config with new pattern
735736
let mut file = File::create(&config_path)?;
@@ -743,8 +744,7 @@ mod tests {
743744

744745
// Reload and verify both patterns work
745746
config.reload()?;
746-
assert!(config.matches_ignore_pattern("GTAC"));
747-
assert!(config.matches_ignore_pattern("123-45-6789"));
747+
assert!(config.get_ignore_patterns().unwrap().len() == 2);
748748

749749
// Update config to remove all patterns
750750
let mut file = File::create(&config_path)?;
@@ -757,8 +757,7 @@ mod tests {
757757

758758
// Reload and verify no patterns match
759759
config.reload()?;
760-
assert!(!config.matches_ignore_pattern("GTAC"));
761-
assert!(!config.matches_ignore_pattern("123-45-6789"));
760+
assert!(config.get_ignore_patterns().unwrap().is_empty());
762761

763762
Ok(())
764763
}

crates/codebook/src/lib.rs

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@ pub mod dictionaries;
22
mod logging;
33
pub mod parser;
44
pub mod queries;
5+
pub mod regexes;
56
mod splitter;
67

8+
use regex::Regex;
9+
use regexes::get_default_skip_patterns;
710
use std::sync::Arc;
811

912
use codebook_config::CodebookConfig;
@@ -15,6 +18,7 @@ use parser::WordLocation;
1518
pub struct Codebook {
1619
config: Arc<CodebookConfig>,
1720
manager: DictionaryManager,
21+
regex_cache: Vec<Regex>,
1822
}
1923

2024
// Custom 'codebook' dictionary could be removed later for a more general solution.
@@ -23,7 +27,12 @@ static DEFAULT_DICTIONARIES: &[&str; 3] = &["codebook", "software_terms", "compu
2327
impl Codebook {
2428
pub fn new(config: Arc<CodebookConfig>) -> Result<Self, Box<dyn std::error::Error>> {
2529
let manager = DictionaryManager::new(&config.cache_dir);
26-
Ok(Self { config, manager })
30+
let regex_cache = get_default_skip_patterns();
31+
Ok(Self {
32+
config,
33+
manager,
34+
regex_cache,
35+
})
2736
}
2837

2938
/// Get WordLocations for a block of text.
@@ -42,23 +51,32 @@ impl Codebook {
4251
// call spell check on each dictionary
4352
let language = self.resolve_language(language, file_path);
4453
let dictionaries = self.get_dictionaries(Some(language));
45-
parser::find_locations(text, language, |word| {
46-
if self.config.should_flag_word(word) {
47-
return false;
48-
}
49-
if word.len() < 3 {
50-
return true;
51-
}
52-
if self.config.is_allowed_word(word) {
53-
return true;
54-
}
55-
for dictionary in &dictionaries {
56-
if dictionary.check(word) {
54+
let mut regex_patterns = self.regex_cache.clone();
55+
if let Some(config_patterns) = self.config.get_ignore_patterns() {
56+
regex_patterns.extend(config_patterns);
57+
}
58+
parser::find_locations(
59+
text,
60+
language,
61+
|word| {
62+
if self.config.should_flag_word(word) {
63+
return false;
64+
}
65+
if word.len() < 3 {
5766
return true;
5867
}
59-
}
60-
false
61-
})
68+
if self.config.is_allowed_word(word) {
69+
return true;
70+
}
71+
for dictionary in &dictionaries {
72+
if dictionary.check(word) {
73+
return true;
74+
}
75+
}
76+
false
77+
},
78+
&regex_patterns,
79+
)
6280
}
6381

6482
fn resolve_language(

0 commit comments

Comments
 (0)