blopker
diff --git a/‎CHANGELOG.md
Lines changed: 8 additions & 0 deletions b/‎CHANGELOG.md
Lines changed: 8 additions & 0 deletions
diff --git a/‎README.md
Lines changed: 39 additions & 2 deletions b/‎README.md
Lines changed: 39 additions & 2 deletions
diff --git a/‎codebook.toml
Lines changed: 1 addition & 1 deletion b/‎codebook.toml
Lines changed: 1 addition & 1 deletion
diff --git a/‎crates/codebook-config/src/lib.rs
Lines changed: 37 additions & 38 deletions b/‎crates/codebook-config/src/lib.rs
Lines changed: 37 additions & 38 deletions
diff --git a/‎crates/codebook/src/lib.rs
Lines changed: 34 additions & 16 deletions b/‎crates/codebook/src/lib.rs
Lines changed: 34 additions & 16 deletions
@@ -1,3 +1,11 @@
+[0.3.0]
+
+Breaking changes:
+
+- User-defined regex patterns are now matched against each line of a file instead of each word. Patterns anchored with `^` and `$` therefore have to match the entire line, not a single word. For example, to match DNA this pattern used to work: `^[ATCG]+$`. It now needs to be something like `\\b[ATCG]+\\b` (the doubled `\\` is TOML escaping).
+
+- Codebook will now ignore text like URLs and color hex codes by default. See README `User-Defined Regex Patterns` for more details.
+
 [0.2.13]
 
 - Switch out OpenSSL for rustls
 
@@ -221,11 +221,15 @@ flag_words = ["todo", "fixme"]
 ignore_paths = ["target/**/*", "**/*.json", ".git/**/*"]
 
 # List of regex patterns to ignore when spell checking
+# Patterns are matched against each line of text, not individual words
 # Useful for domain-specific strings or patterns
+# Note: Backslashes must be escaped in TOML (use \\ instead of \)
 # Default: []
 ignore_patterns = [
-    "^[ATCG]+$",             # DNA sequences
-    "\\d{3}-\\d{2}-\\d{4}"   # Social Security Number format
+    "\\b[ATCG]+\\b",             # DNA sequences
+    "\\d{3}-\\d{2}-\\d{4}",      # Social Security Number format
+    "\\b[A-Z]{2,}\\b",           # All-caps words like "HTML", "CSS"
+    "https?://[^\\s]+"           # URLs
 ]
 
 # Whether to use global configuration (project config only)
@@ -248,6 +252,39 @@ use_global = true
 - Project settings are saved automatically when words are added
 - Configuration files are automatically reloaded when they change
 
+### User-Defined Regex Patterns
+
+The `ignore_patterns` configuration allows you to define custom regex patterns to skip during spell checking. Here are important details about how they work:
+
+**Default Patterns**: Codebook already includes built-in regex patterns for common technical strings, so you don't need to define these yourself:
+- URLs: `https?://[^\\s]+`
+- Hex colors: `#[0-9a-fA-F]{3,8}` (like `#deadbeef`, `#fff`)
+- Email addresses: `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}`
+- File paths: `/[^\\s]*` (Unix) and `[A-Za-z]:\\\\[^\\s]*` (Windows)
+- UUIDs: `[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}`
+- Base64 strings: `[A-Za-z0-9+/]{20,}={0,2}` (20+ characters)
+- Git commit hashes: `\\b[0-9a-fA-F]{7,40}\\b`
+- Markdown links: `\\[([^\\]]+)\\]\\(([^)]+)\\)`
+
+**Line-by-Line Matching**: Regex patterns are applied to each line of text, not individual words. This means `^` and `$` anchor to the start and end of the line rather than of a word; use `\\b` word boundaries to match whole words anywhere in a line.
+
+**TOML Escaping**: Since configuration files use TOML format, backslashes in regex patterns must be escaped by doubling them:
+- Use `\\b` for word boundaries (not `\b`)
+- Use `\\d` for digits (not `\d`)
+- Use `\\\\` for literal backslashes (not `\\`)
+
+**Examples**:
+```toml
+ignore_patterns = [
+    "\\b[ATCG]+\\b",           # DNA sequences with word boundaries
+    "^\\s*//.*$",              # Comment lines starting with //
+    "https?://[^\\s]+",        # URLs (note the escaped \s)
+    "\\$[a-zA-Z_][a-zA-Z0-9_]*", # Variables starting with $
+]
+```
+
+**Migration Note**: If you're upgrading from an older version, patterns that used `^` and `$` anchors may need adjustment since matching now occurs line-by-line rather than word-by-word.
+
 ## Goals
 
 Spell checking is complicated, and opinions about how it should be done, especially with code, differ. This section is about the trade-offs that steer decisions.
 
@@ -22,4 +22,4 @@ ignore_paths = [
     "**/*.json",
     ".git/**/*",
 ]
-ignore_patterns = ["^[ATCG]+$"]
+ignore_patterns = ["\\b[ATCG]+\\b"]
@@ -3,7 +3,7 @@ use crate::settings::ConfigSettings;
 use glob::Pattern;
 use log::debug;
 use log::info;
-use regex::RegexSet;
+use regex::Regex;
 use std::env;
 use std::fs;
 use std::io;
@@ -30,7 +30,7 @@ pub struct CodebookConfig {
     /// Combined settings (global merged with project overrides)
     effective_settings: RwLock<ConfigSettings>,
     /// Compiled regex patterns for ignoring text
-    regex_set: RwLock<Option<RegexSet>>,
+    regex_cache: RwLock<Option<Vec<Regex>>>,
     /// Path to the project-specific config file
     pub project_config_path: Option<PathBuf>,
     project_config_state: RwLock<Option<ConfigFileState>>,
@@ -47,7 +47,7 @@ impl Default for CodebookConfig {
             project_settings: RwLock::new(ConfigSettings::default()),
             global_settings: RwLock::new(None),
             effective_settings: RwLock::new(ConfigSettings::default()),
-            regex_set: RwLock::new(None),
+            regex_cache: RwLock::new(None),
             project_config_path: None,
             project_config_state: RwLock::new(None),
             global_config_path: None,
@@ -356,7 +356,7 @@ impl CodebookConfig {
         }
 
         // Invalidate regex cache
-        *self.regex_set.write().unwrap() = None;
+        *self.regex_cache.write().unwrap() = None;
     }
 
     /// Add a word to the project config's allowlist
@@ -493,9 +493,6 @@ impl CodebookConfig {
 
     /// Check if a word is in the effective allowlist
     pub fn is_allowed_word(&self, word: &str) -> bool {
-        if self.matches_ignore_pattern(word) {
-            return true;
-        }
         let word = word.to_ascii_lowercase();
         self.effective_settings
             .read()
@@ -516,27 +513,26 @@ impl CodebookConfig {
             .any(|w| w == &word)
     }
 
-    /// Check if text matches any of the ignore patterns
-    fn matches_ignore_pattern(&self, word: &str) -> bool {
-        let patterns = &self.effective_settings.read().unwrap().ignore_patterns;
-        if patterns.is_empty() {
-            return false;
-        }
-
-        // Lazily initialize the RegexSet
-        let mut regex_set = self.regex_set.write().unwrap();
-        if regex_set.is_none() {
-            match RegexSet::new(patterns) {
-                Ok(set) => *regex_set = Some(set),
-                Err(_) => return false,
-            }
+    /// Get the list of user-defined ignore patterns
+    pub fn get_ignore_patterns(&self) -> Option<Vec<Regex>> {
+        let str_patterns = self
+            .effective_settings
+            .read()
+            .unwrap()
+            .ignore_patterns
+            .clone();
+
+        // Lazily initialize the Regex cache
+        let mut regex_cache = self.regex_cache.write().unwrap();
+        if regex_cache.is_none() {
+            let regex_set = str_patterns
+                .into_iter()
+                .map(|pattern| Regex::new(&pattern).unwrap())
+                .collect::<Vec<_>>();
+            *regex_cache = Some(regex_set);
         }
 
-        // Check if text matches any pattern
-        if let Some(set) = &*regex_set {
-            return set.is_match(word);
-        }
-        false
+        regex_cache.clone()
     }
 
     /// Clean the cache directory
@@ -702,12 +698,18 @@ mod tests {
         file.write_all(a.as_bytes())?;
 
         let config = load_from_file(ConfigType::Project, &config_path)?;
-        assert!(config.matches_ignore_pattern("GTAC"));
-        assert!(config.matches_ignore_pattern("AATTCCGG"));
-        assert!(config.matches_ignore_pattern("123-45-6789"));
-        assert!(!config.matches_ignore_pattern("Hello"));
-        assert!(!config.matches_ignore_pattern("GTACZ")); // Invalid DNA sequence
-
+        let patterns = config
+            .effective_settings
+            .read()
+            .unwrap()
+            .ignore_patterns
+            .clone();
+        assert!(patterns.contains(&String::from("^[ATCG]+$")));
+        assert!(patterns.contains(&String::from("\\d{3}-\\d{2}-\\d{4}")));
+        let reg = config.get_ignore_patterns();
+
+        let patterns = reg.as_ref().unwrap();
+        assert!(patterns.len() == 2);
         Ok(())
     }
 
@@ -728,8 +730,7 @@ mod tests {
         )?;
 
         let config = load_from_file(ConfigType::Project, &config_path)?;
-        assert!(config.matches_ignore_pattern("GTAC"));
-        assert!(!config.matches_ignore_pattern("123-45-6789"));
+        assert!(config.get_ignore_patterns().unwrap().len() == 1);
 
         // Update config with new pattern
         let mut file = File::create(&config_path)?;
@@ -743,8 +744,7 @@ mod tests {
 
         // Reload and verify both patterns are loaded
         config.reload()?;
-        assert!(config.matches_ignore_pattern("GTAC"));
-        assert!(config.matches_ignore_pattern("123-45-6789"));
+        assert!(config.get_ignore_patterns().unwrap().len() == 2);
 
         // Update config to remove all patterns
         let mut file = File::create(&config_path)?;
@@ -757,8 +757,7 @@ mod tests {
 
         // Reload and verify no patterns remain
         config.reload()?;
-        assert!(!config.matches_ignore_pattern("GTAC"));
-        assert!(!config.matches_ignore_pattern("123-45-6789"));
+        assert!(config.get_ignore_patterns().unwrap().is_empty());
 
         Ok(())
     }
 
@@ -2,8 +2,11 @@ pub mod dictionaries;
 mod logging;
 pub mod parser;
 pub mod queries;
+pub mod regexes;
 mod splitter;
 
+use regex::Regex;
+use regexes::get_default_skip_patterns;
 use std::sync::Arc;
 
 use codebook_config::CodebookConfig;
@@ -15,6 +18,7 @@ use parser::WordLocation;
 pub struct Codebook {
     config: Arc<CodebookConfig>,
     manager: DictionaryManager,
+    regex_cache: Vec<Regex>,
 }
 
 // Custom 'codebook' dictionary could be removed later for a more general solution.
@@ -23,7 +27,12 @@ static DEFAULT_DICTIONARIES: &[&str; 3] = &["codebook", "software_terms", "compu
 impl Codebook {
     pub fn new(config: Arc<CodebookConfig>) -> Result<Self, Box<dyn std::error::Error>> {
         let manager = DictionaryManager::new(&config.cache_dir);
-        Ok(Self { config, manager })
+        let regex_cache = get_default_skip_patterns();
+        Ok(Self {
+            config,
+            manager,
+            regex_cache,
+        })
     }
 
     /// Get WordLocations for a block of text.
@@ -42,23 +51,32 @@ impl Codebook {
         // call spell check on each dictionary
         let language = self.resolve_language(language, file_path);
         let dictionaries = self.get_dictionaries(Some(language));
-        parser::find_locations(text, language, |word| {
-            if self.config.should_flag_word(word) {
-                return false;
-            }
-            if word.len() < 3 {
-                return true;
-            }
-            if self.config.is_allowed_word(word) {
-                return true;
-            }
-            for dictionary in &dictionaries {
-                if dictionary.check(word) {
+        let mut regex_patterns = self.regex_cache.clone();
+        if let Some(config_patterns) = self.config.get_ignore_patterns() {
+            regex_patterns.extend(config_patterns);
+        }
+        parser::find_locations(
+            text,
+            language,
+            |word| {
+                if self.config.should_flag_word(word) {
+                    return false;
+                }
+                if word.len() < 3 {
                     return true;
                 }
-            }
-            false
-        })
+                if self.config.is_allowed_word(word) {
+                    return true;
+                }
+                for dictionary in &dictionaries {
+                    if dictionary.check(word) {
+                        return true;
+                    }
+                }
+                false
+            },
+            &regex_patterns,
+        )
     }
 
     fn resolve_language(
Original file line number	Diff line number	Diff line change
`@@ -22,4 +22,4 @@ ignore_paths = [`
`22`	`22`	`"**/*.json",`
`23`	`23`	`".git/**/*",`
`24`	`24`	`]`
`25`		`-ignore_patterns = ["^[ATCG]+$"]`
	`25`	`+ignore_patterns = ["\\b[ATCG]+\\b"]`