Merge pull request #896 from helixml/fix/rag-hotfixes

lukemarsden · web-flow · commit d7bbca9f5c60 · 2025-03-17T18:30:00.000Z
Fix/rag hotfixes
diff --git a/api/pkg/prompts/templates/knowledge.tmpl b/api/pkg/prompts/templates/knowledge.tmpl
@@ -11,21 +11,47 @@ We have found the following context you may refer to in your answer:
 </article>
 {{- end }}
 
-Always provide references in the body of your answer in the format '[DOC_ID:DocumentID]'. For example, "The answer is 42 [DOC_ID:f6962c8007]."
+IMPORTANT: When referencing documents, always use EXACTLY the document_id values provided above. DO NOT extract or use page IDs from URLs within the content. Always provide references in the body of your answer in the format '[DOC_ID:DocumentID]'. For example, "The answer is 42 [DOC_ID:f6962c8007]." NOT "[DOC_ID:123456]" where 123456 might be a page ID in a URL.
 
 Always provide references in the body of your answer!
 
-After your answer, include one excerpt per document_id in XML format surrounded by three dashes like ---. These should be short sentence-long excerpts from the content that you referenced when answering the question, in the form below. Provide one excerpt per document. Provide one EXACT QUOTE per document. Do not include any other text inside the --- markers.
+After completing your answer, create an excerpt section with important quotes from each referenced document.
+
+Follow these steps:
+1. Identify each unique document_id you cited in your answer
+2. For each document, select a representative quote (1-2 sentences) that best supports your answer
+3. Include each document exactly once in the excerpt block using the format below
+
+⚠️ SYSTEM ERROR PREVENTION NOTICE ⚠️
+▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
+The document database processes each document_id only once. Duplicate entries
+will trigger this error:
+
+ERROR: Duplicate document_id detected. Excerpt processing failed.
+Document with duplicate entries: [document_id]. Please provide exactly 
+one excerpt per document_id.
+▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
+
+Required excerpt format:
 
 ---
 <excerpts>
 <excerpt>
-<document_id>[DocumentID]</document_id>
-<snippet>[Excerpt]</snippet>
+<document_id>document-id-you-cited</document_id>
+<snippet>A representative quote from this document that supports your answer.</snippet>
+</excerpt>
+<excerpt>
+<document_id>another-document-id-you-cited</document_id>
+<snippet>A key quote from this document that supports your answer.</snippet>
 </excerpt>
 </excerpts>
 ---
 
+FINAL CHECK:
+- Each document appears exactly once in your excerpts
+- No introductory text appears before the excerpt block
+- All document_ids match those you cited in your answer
+
 {{- end }}
 
 {{- if .KnowledgeResults }}
@@ -53,21 +79,47 @@ We have found the following context you may refer to in your answer:
 </article>
 {{- end }}
 
-Always provide references in the body of your answer in the format '[DOC_ID:DocumentID]'. For example, "The answer is 42 [DOC_ID:f6962c8007]."
+IMPORTANT: When referencing documents, always use EXACTLY the document_id values provided above. DO NOT extract or use page IDs from URLs within the content. Always provide references in the body of your answer in the format '[DOC_ID:DocumentID]'. For example, "The answer is 42 [DOC_ID:f6962c8007]." NOT "[DOC_ID:123456]" where 123456 might be a page ID in a URL.
 
 Always provide references in the body of your answer!
 
-After your answer, include one excerpt per document_id in XML format surrounded by three dashes like ---. These should be short sentence-long excerpts from the content that you referenced when answering the question, in the form below. Provide one excerpt per document. Provide one EXACT QUOTE per document. Do not include any other text inside the --- markers.
+After completing your answer, create an excerpt section with important quotes from each referenced document.
+
+Follow these steps:
+1. Identify each unique document_id you cited in your answer
+2. For each document, select a representative quote (1-2 sentences) that best supports your answer
+3. Include each document exactly once in the excerpt block using the format below
+
+⚠️ SYSTEM ERROR PREVENTION NOTICE ⚠️
+▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
+The document database processes each document_id only once. Duplicate entries
+will trigger this error:
+
+ERROR: Duplicate document_id detected. Excerpt processing failed.
+Document with duplicate entries: [document_id]. Please provide exactly 
+one excerpt per document_id.
+▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
+
+Required excerpt format:
 
 ---
 <excerpts>
 <excerpt>
-<document_id>[DocumentID]</document_id>
-<snippet>[Excerpt]</snippet>
+<document_id>document-id-you-cited</document_id>
+<snippet>A representative quote from this document that supports your answer.</snippet>
+</excerpt>
+<excerpt>
+<document_id>another-document-id-you-cited</document_id>
+<snippet>A key quote from this document that supports your answer.</snippet>
 </excerpt>
 </excerpts>
 ---
 
+FINAL CHECK:
+- Each document appears exactly once in your excerpts
+- No introductory text appears before the excerpt block
+- All document_ids match those you cited in your answer
+
 {{- end }}
 
 Here is the question from the user:
diff --git a/api/pkg/rag/rag_haystack.go b/api/pkg/rag/rag_haystack.go
@@ -10,6 +10,7 @@ import (
 	"net/http"
 	"path/filepath"
 	"strconv"
+	"strings"
 
 	"github.com/helixml/helix/api/pkg/types"
 	"github.com/rs/zerolog/log"
@@ -40,24 +41,22 @@ func (h *HaystackRAG) Index(ctx context.Context, chunks ...*types.SessionRAGInde
 
 	logger.Debug().Msg("Indexing documents")
 
-	for _, chunk := range chunks {
-		logger.Debug().
-			Str("data_entity_id", chunk.DataEntityID).
-			Str("document_id", chunk.DocumentID).
-			Msg("indexing chunk")
+	// Early exit if no chunks to index
+	if len(chunks) == 0 {
+		logger.Warn().Msg("no chunks to index, skipping")
+		return nil
+	}
 
-		// Metadata check before processing
-		if chunk.Metadata != nil {
-			logger.Info().
-				Str("document_id", chunk.DocumentID).
-				Interface("chunk_metadata", chunk.Metadata).
-				Msg("Chunk contains metadata")
-		} else {
-			logger.Info().
+	for _, chunk := range chunks {
+		// Skip chunks with empty content
+		if chunk.Content == "" {
+			logger.Warn().
 				Str("document_id", chunk.DocumentID).
-				Msg("Chunk does NOT contain metadata")
+				Msg("skipping chunk with empty content")
+			continue
 		}
 
+		// Create multipart/form-data
 		var b bytes.Buffer
 		w := multipart.NewWriter(&b)
 
@@ -67,15 +66,16 @@ func (h *HaystackRAG) Index(ctx context.Context, chunks ...*types.SessionRAGInde
 
 		logger.Debug().Str("filename", filename).Msg("Indexing file")
 
-		// Add the file as a part
+		// Create a form file for the document
 		part, err := w.CreateFormFile("file", filename)
 		if err != nil {
 			return fmt.Errorf("creating form file: %w", err)
 		}
 
+		// Write the content - preserve original content including any NUL bytes
 		_, err = part.Write([]byte(chunk.Content))
 		if err != nil {
-			return fmt.Errorf("writing file content: %w", err)
+			return fmt.Errorf("writing content: %w", err)
 		}
 
 		// Add metadata for the document
@@ -90,7 +90,7 @@ func (h *HaystackRAG) Index(ctx context.Context, chunks ...*types.SessionRAGInde
 			// Add other metadata as needed
 		}
 
-		// Add any custom metadata from the chunk
+		// Add user metadata if present
 		if chunk.Metadata != nil {
 			logger.Info().
 				Str("document_id", chunk.DocumentID).
@@ -196,6 +196,18 @@ func (h *HaystackRAG) Query(ctx context.Context, q *types.SessionRAGQuery) ([]*t
 		Interface("document_id_list", q.DocumentIDList).
 		Logger()
 
+	// Remove NUL bytes from the prompt first
+	sanitizedPrompt := removeNULBytes(q.Prompt)
+	if sanitizedPrompt != q.Prompt {
+		logger.Warn().Msg("query prompt contained NUL bytes that were removed")
+	}
+
+	// Check for empty prompt after sanitizing - return early with error
+	if sanitizedPrompt == "" {
+		logger.Error().Msg("empty query prompt received (or only NUL bytes), rejecting request")
+		return nil, fmt.Errorf("query prompt cannot be empty")
+	}
+
 	// Build document ID conditions
 	documentIDConditions := make([]Condition, len(q.DocumentIDList))
 	for i, documentID := range q.DocumentIDList {
@@ -208,7 +220,7 @@ func (h *HaystackRAG) Query(ctx context.Context, q *types.SessionRAGQuery) ([]*t
 
 	// Build the complete query request
 	queryReq := QueryRequest{
-		Query: q.Prompt,
+		Query: sanitizedPrompt,
 		TopK:  q.MaxResults,
 		Filters: QueryFilter{
 			Operator: "AND",
@@ -339,10 +351,18 @@ func (h *HaystackRAG) Delete(ctx context.Context, req *types.DeleteIndexRequest)
 
 	logger.Debug().Msg("Deleting documents from Haystack")
 
-	// Create delete request
+	// Create delete request with properly formatted filters
+	// The Haystack service expects filters with operator and conditions
 	deleteReq := map[string]interface{}{
 		"filters": map[string]interface{}{
-			"data_entity_id": req.DataEntityID,
+			"operator": "AND",
+			"conditions": []map[string]interface{}{
+				{
+					"field":    "meta.data_entity_id",
+					"operator": "==",
+					"value":    req.DataEntityID,
+				},
+			},
 		},
 	}
 
@@ -421,3 +441,8 @@ func toString(value interface{}) string {
 		return fmt.Sprint(v)
 	}
 }
+
+// removeNULBytes returns s with every NUL byte ("\x00") removed.
+func removeNULBytes(s string) string {
+	return strings.ReplaceAll(s, "\x00", "")
+}
diff --git a/api/pkg/rag/util.go b/api/pkg/rag/util.go
@@ -7,7 +7,8 @@ import (
 
 // Extract document IDs from the prompt
 func ParseDocumentIDs(prompt string) []string {
-	re := regexp.MustCompile(`\[DOC_ID:([0-9a-f]+)\]`)
+	// Match IDs made of letters, digits, underscores and hyphens, not just lowercase hex digits
+	re := regexp.MustCompile(`\[DOC_ID:([a-zA-Z0-9_-]+)\]`)
 	matches := re.FindAllStringSubmatch(prompt, -1)
 
 	// Convert matches to slice of strings
diff --git a/haystack_service/app/api.py b/haystack_service/app/api.py
@@ -86,9 +86,25 @@ async def process_file(
     # Get file extension
     _, ext = os.path.splitext(file.filename)
     
-    # Save file temporarily
+    # Read the file content
+    content = await file.read()
+    
+    # Check for empty content
+    if not content:
+        logger.error("Empty file content received")
+        raise HTTPException(status_code=422, detail="Input validation error: File content cannot be empty")
+    
+    # For binary files like PDFs, we should NOT sanitize content as it will corrupt the file
+    # PDF files and other binary formats may contain NUL bytes as part of their format
+    # NUL bytes will be handled after text extraction in the converter
+    
+    # Only check if the content is ONLY NUL bytes (which would be invalid)
+    if content == b'\x00' * len(content):
+        logger.error("File contained only NUL bytes")
+        raise HTTPException(status_code=422, detail="Input validation error: File content cannot be empty (contained only NUL bytes)")
+    
+    # Save file temporarily with original binary content intact
     with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp:
-        content = await file.read()
         temp.write(content)
         temp_path = temp.name
     
@@ -125,9 +141,20 @@ async def extract_text(
     # Get file extension
     _, ext = os.path.splitext(file.filename)
     
-    # Save file temporarily
+    # Read the file content
+    content = await file.read()
+    
+    # Check for empty content
+    if not content:
+        logger.error("Empty file content received")
+        raise HTTPException(status_code=422, detail="Input validation error: File content cannot be empty")
+    
+    # For binary files like PDFs, we should NOT sanitize content as it will corrupt the file
+    # PDF files and other binary formats may contain NUL bytes as part of their format
+    # NUL bytes will be handled after text extraction in the converter
+    
+    # Save file temporarily with original binary content intact
     with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp:
-        content = await file.read()
         temp.write(content)
         temp_path = temp.name
     
@@ -150,12 +177,29 @@ async def query(
     """Query for relevant documents"""
     
     try:
+        # Check for empty query text
+        if not request.query or request.query.strip() == "":
+            raise HTTPException(status_code=422, detail="Input validation error: `query` cannot be empty")
+        
+        # Remove NUL bytes from query if present
+        sanitized_query = request.query.replace('\x00', '')
+        if sanitized_query != request.query:
+            logger.warning("Query contained NUL bytes that were removed")
+        
+        # Check again for emptiness after sanitizing
+        if not sanitized_query or sanitized_query.strip() == "":
+            logger.error("Query contained only NUL bytes")
+            raise HTTPException(status_code=422, detail="Input validation error: `query` cannot be empty (contained only NUL bytes)")
+            
         results = await service.query(
-            query_text=request.query,
+            query_text=sanitized_query,
             filters=request.filters,
             top_k=request.top_k
         )
         return {"results": results}
+    except HTTPException:
+        # Re-raise HTTP exceptions
+        raise
     except Exception as e:
         logger.error(f"Error querying: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Error querying: {str(e)}")
diff --git a/haystack_service/app/converters.py b/haystack_service/app/converters.py
@@ -80,6 +80,11 @@ def run(
                     ]
                     text = "\n\n".join(el for el in markdown_elements if el)
                 
+                # Filter out NUL bytes from text after extraction
+                if '\x00' in text:
+                    logger.warning(f"Filtered NUL bytes from document text extracted from {path}")
+                    text = text.replace('\x00', '')
+                
                 if text.strip():
                     # Create document with metadata
                     doc_meta = meta.copy()
diff --git a/haystack_service/app/service.py b/haystack_service/app/service.py
@@ -93,6 +93,7 @@ def _init_indexing_pipeline(self):
         splitter.warm_up()
         
         # Writer for the vector store (which now handles both embeddings and BM25)
+        # NUL bytes are filtered out in VectorchordDocumentStore.write_documents method
         vector_writer = DocumentWriter(
             document_store=self.document_store,
             policy=DuplicatePolicy.OVERWRITE  # Use overwrite policy to handle duplicate documents
@@ -330,6 +331,16 @@ async def query(self, query_text: str, filters: Dict[str, Any] = None, top_k: in
         Returns:
             List of dictionaries with document data
         """
+        # Remove NUL bytes from query if present
+        if "\x00" in query_text:
+            logger.warning("Query contained NUL bytes that will be removed")
+            query_text = query_text.replace("\x00", "")
+        
+        # Validate query text after sanitizing
+        if not query_text or query_text.strip() == "":
+            logger.error("Empty query text received or contained only NUL bytes")
+            raise ValueError("Query text cannot be empty")
+            
         logger.info(f"Querying with: '{query_text}', filters: {filters}, top_k: {top_k}")
         
         # Format filters correctly if they're provided
@@ -522,6 +533,12 @@ async def query(self, query_text: str, filters: Dict[str, Any] = None, top_k: in
                 documents = output.get("document_joiner", {}).get("documents", [])
                 logger.info(f"Document joiner returned {len(documents)} documents")
                 
+                # Filter out NUL bytes from document content
+                for doc in documents:
+                    if doc.content and '\x00' in doc.content:
+                        logger.warning(f"Filtering NUL bytes from retrieval result document: {doc.id}")
+                        doc.content = doc.content.replace('\x00', '')
+                
                 # Debug the joined results
                 for i, doc in enumerate(documents):
                     logger.info(f"DEBUG: Final joined result {i+1}: id={getattr(doc, 'id', 'unknown')}, "
diff --git a/haystack_service/app/vectorchord/document_store/document_store.py b/haystack_service/app/vectorchord/document_store/document_store.py
@@ -655,6 +655,13 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
         if not documents:
             return 0
 
+        # Filter out any NUL bytes in document content before writing to PostgreSQL
+        logger = logging.getLogger(__name__)
+        for doc in documents:
+            if doc.content and '\x00' in doc.content:
+                logger.warning(f"Document store: removing NUL bytes from document content before database write: {doc.id}")
+                doc.content = doc.content.replace('\x00', '')
+
         # Convert Document objects to Postgres compatible format
         pg_documents = self._from_haystack_to_pg_documents(documents)