diff --git a/content/blog/codegen-learnings/flow.png b/content/blog/codegen-learnings/flow.png new file mode 100644 index 000000000000..da4173678736 Binary files /dev/null and b/content/blog/codegen-learnings/flow.png differ diff --git a/content/blog/codegen-learnings/flow.tldr b/content/blog/codegen-learnings/flow.tldr new file mode 100644 index 000000000000..a0bbc8ded9c9 --- /dev/null +++ b/content/blog/codegen-learnings/flow.tldr @@ -0,0 +1,267 @@ +{ + "tldrawFileFormatVersion": 1, + "schema": { + "schemaVersion": 2, + "sequences": { + "com.tldraw.store": 4, + "com.tldraw.asset": 1, + "com.tldraw.camera": 1, + "com.tldraw.document": 2, + "com.tldraw.instance": 25, + "com.tldraw.instance_page_state": 5, + "com.tldraw.page": 1, + "com.tldraw.instance_presence": 6, + "com.tldraw.pointer": 1, + "com.tldraw.shape": 4, + "com.tldraw.asset.bookmark": 2, + "com.tldraw.asset.image": 5, + "com.tldraw.asset.video": 5, + "com.tldraw.shape.group": 0, + "com.tldraw.shape.text": 2, + "com.tldraw.shape.bookmark": 2, + "com.tldraw.shape.draw": 2, + "com.tldraw.shape.geo": 9, + "com.tldraw.shape.note": 8, + "com.tldraw.shape.line": 5, + "com.tldraw.shape.frame": 0, + "com.tldraw.shape.arrow": 5, + "com.tldraw.shape.highlight": 1, + "com.tldraw.shape.embed": 4, + "com.tldraw.shape.image": 4, + "com.tldraw.shape.video": 2, + "com.tldraw.binding.arrow": 0 + } + }, + "records": [ + { + "gridSize": 10, + "name": "", + "meta": {}, + "id": "document:document", + "typeName": "document" + }, + { + "meta": {}, + "id": "page:page", + "name": "Page 1", + "index": "a1", + "typeName": "page" + }, + { + "id": "pointer:pointer", + "typeName": "pointer", + "x": 357.43359375, + "y": 132.7734375, + "lastActivityTimestamp": 1735267617634, + "meta": {} + }, + { + "followingUserId": null, + "opacityForNextShape": 1, + "stylesForNextShape": { + "tldraw:size": "s", + "tldraw:font": "sans", + "tldraw:geo": "arrow-right" + }, + "brush": null, + "scribbles": [], + "cursor": { + "type": "cross", + 
"rotation": 0 + }, + "isFocusMode": false, + "exportBackground": true, + "isDebugMode": false, + "isToolLocked": false, + "screenBounds": { + "x": 0, + "y": 0, + "w": 1128, + "h": 978 + }, + "insets": [ + false, + false, + true, + false + ], + "zoomBrush": null, + "isGridMode": false, + "isPenMode": false, + "chatMessage": "", + "isChatting": false, + "highlightedUserIds": [], + "isFocused": true, + "devicePixelRatio": 2, + "isCoarsePointer": false, + "isHoveringCanvas": true, + "openMenus": [], + "isChangingStyle": false, + "isReadonly": false, + "meta": {}, + "duplicateProps": null, + "id": "instance:instance", + "currentPageId": "page:page", + "typeName": "instance" + }, + { + "editingShapeId": null, + "croppingShapeId": null, + "selectedShapeIds": [ + "shape:0vPmbhCIhdgPr7gYnOxm9" + ], + "hoveredShapeId": "shape:0vPmbhCIhdgPr7gYnOxm9", + "erasingShapeIds": [], + "hintingShapeIds": [], + "focusedGroupId": null, + "meta": {}, + "id": "instance_page_state:page:page", + "pageId": "page:page", + "typeName": "instance_page_state" + }, + { + "x": 49.0078125, + "y": 97.7109375, + "z": 1, + "meta": {}, + "id": "camera:page:page", + "typeName": "camera" + }, + { + "x": 40.28125, + "y": 79.8203125, + "rotation": 0, + "isLocked": false, + "opacity": 1, + "meta": {}, + "id": "shape:BJfJwAyLKVoUOGaixU2Xf", + "type": "text", + "props": { + "color": "black", + "size": "s", + "w": 252.359375, + "text": "User query:\n\n\"Generate code for S3 Bucket\"", + "font": "sans", + "textAlign": "start", + "autoSize": true, + "scale": 1 + }, + "parentId": "page:page", + "index": "a1", + "typeName": "shape" + }, + { + "x": 374.58203125, + "y": 54.3125, + "rotation": 0, + "isLocked": false, + "opacity": 1, + "meta": {}, + "id": "shape:z2gtvIBdf6_eCX2gzQ2L1", + "type": "text", + "props": { + "color": "black", + "size": "s", + "w": 310.2265625, + "text": "Search terms:\n\n\"AWS S3 bucket\",\n\"Pulumi AWS S3\",\n\"create S3 bucket Pulumi TypeScript\"", + "font": "sans", + "textAlign": "start", 
+ "autoSize": true, + "scale": 1 + }, + "parentId": "page:page", + "index": "a22v7", + "typeName": "shape" + }, + { + "x": 25.6796875, + "y": 58.140625, + "rotation": 0, + "isLocked": false, + "opacity": 1, + "meta": {}, + "id": "shape:PGJKCoOonmat14xrifIp0", + "type": "geo", + "props": { + "w": 286.8671875, + "h": 132.9296875, + "geo": "rectangle", + "color": "black", + "labelColor": "black", + "fill": "none", + "dash": "draw", + "size": "s", + "font": "sans", + "text": "", + "align": "middle", + "verticalAlign": "middle", + "growY": 0, + "url": "", + "scale": 1 + }, + "parentId": "page:page", + "index": "a39Cr", + "typeName": "shape" + }, + { + "x": 362.58203125, + "y": 35.85546875000003, + "rotation": 0, + "isLocked": false, + "opacity": 1, + "meta": {}, + "id": "shape:hrkCX2QL0NyXDz_xl3n_i", + "type": "geo", + "props": { + "w": 342.9609375, + "h": 172.41015624999997, + "geo": "rectangle", + "color": "black", + "labelColor": "black", + "fill": "none", + "dash": "draw", + "size": "s", + "font": "sans", + "text": "", + "align": "middle", + "verticalAlign": "middle", + "growY": 0, + "url": "", + "scale": 1 + }, + "parentId": "page:page", + "index": "a40nT", + "typeName": "shape" + }, + { + "x": 322.2734375, + "y": 100.125, + "rotation": 0, + "isLocked": false, + "opacity": 1, + "meta": {}, + "id": "shape:0vPmbhCIhdgPr7gYnOxm9", + "type": "geo", + "props": { + "w": 35.16015625, + "h": 32.6484375, + "geo": "arrow-right", + "color": "black", + "labelColor": "black", + "fill": "none", + "dash": "draw", + "size": "s", + "font": "sans", + "text": "", + "align": "middle", + "verticalAlign": "middle", + "growY": 0, + "url": "", + "scale": 1 + }, + "parentId": "page:page", + "index": "a55Ml", + "typeName": "shape" + } + ] +} \ No newline at end of file diff --git a/content/blog/codegen-learnings/index.md b/content/blog/codegen-learnings/index.md index 9084b265e325..c774138de6b2 100644 --- a/content/blog/codegen-learnings/index.md +++ 
b/content/blog/codegen-learnings/index.md @@ -82,32 +82,16 @@ Because you were looking for the word "pie", you also retrieved a recipe for a S -Now let's formalize this a bit. Recall measures the ratio of the relevant documents retrieved to the total number of relevant docuemtns in RAG: +Now let's formalize this a bit. Recall measures the ratio of the relevant documents retrieved to the total number of relevant documents in RAG: -TODO - -old: - -$$Recall = \frac{N(Retrieved\_documents \cap Relevant\_documents)}{N(Relevant\_documents)}$$ - -fixed1: - -$$Recall = \frac{N(\text{Retrieved\_documents} \cap \text{Relevant\_documents})}{N(\text{Relevant\_documents})}$$ - -fixed2: - -$$Recall = \frac{N(Retrieved\text{\_}documents \cap Relevant\text{\_}documents)}{N(Relevant\text{\_}documents)}$$ - -alternative - -$$Recall = \frac{N(\text{RetrievedDocuments} \cap \text{RelevantDocuments})}{N(\text{RelevantDocuments})}$$ +$$Recall = \frac{N(Retrieved \cap Relevant)}{N(Relevant)}$$ Where -- $N(Retrieved\_documents \cap Relevant\_documents)$ is the number of documents that are both retrieved and relevant. -- $N(Relevant\_documents)$ is the total number of relevant documents in the database. +- $N(Retrieved \cap Relevant)$ is the number of documents that are both retrieved and relevant. +- $N(Relevant)$ is the total number of relevant documents in the database. Good recall means that many documents relevant to the query were retrieved. -$$Precision = \frac{N(Retrieved\_documents \cap Relevant\_documents)}{N(Retrieved\_documents)}$$ +$$Precision = \frac{N(Retrieved \cap Relevant)}{N(Retrieved)}$$ -Where $N(Retrieved\_documents)$ is the total number of documents that were retrieved. +Where $N(Retrieved)$ is the total number of documents that were retrieved. @@ -117,17 +101,37 @@ Naturally, an effective RAG maximizes both the recall and the precision. It's [b ### Practical concerns -Precision and recall are essential in understanding the information retrieval quality, they are quite hard to measure in practice. 
Unlike a cookbook, Pulumi registry contains thousands of ever changing documents, and evaluating how many of them are relevant for every user-submitted query is impractical making recall evaluation for live traffic next to impossible. Things a little easier with precision, where we're dealing with a small number of documents, but even that metric requires a non-trivial evaluation of relevance, which requires an LLM call or a human judge where the number of documents is small. +Precision and recall are essential in understanding the information retrieval quality, but they are quite hard to measure in practice. Unlike a cookbook, the Pulumi registry contains thousands of ever-changing documents, and evaluating how many of them are relevant for every user-submitted query is impractical, making recall evaluation for live traffic next to impossible. Things are a little easier with precision, where we're dealing with a small number of documents, but even that metric requires a non-trivial evaluation of relevance, which needs an LLM call or a human judge even when the number of documents is small. Fortunately, other metrics that often can effectively estimate retrieval quality have been developed. We have found a metric that can predict, with some degree of accuracy, whether the generated code will successfully compile. For this metric, we compare the _tokens_ present in the prompted produced by the LLM with the number of tokens present in the actually generated code. (By token here we understand a compiler token - an identifier such as the name of a class, method or a field and not a traditional LLM token concept), Intuitively, if a token present in the prompt also appears in the generated program, we can assume that the token effectively contributed to the generated program. 
Tokens in the generated program that were not part of the prompt are not necessarily wrong but they are less trusted (they can come from the LLM built-in knowledge or were, ahem, hallucinated) -$$prompt \ coverage = \frac{N(Tokens\_in\_prompt \cap Tokens\_in\_code)}{N(Tokens\_in\_code)} $$ +1: +$$prompt \ coverage = \frac{N(Tokens\_in\_prompt \cap Tokens\_in\_code)}{N(Tokens\_in\_code)}$$ + +2: +$$prompt \ coverage = \frac{N(\text{Tokens\_in\_prompt} \cap \text{Tokens\_in\_code})}{N(\text{Tokens\_in\_code})}$$ + +3: + +$$prompt \ coverage = \frac{N(\text{Tokens_in_prompt} \cap \text{Tokens_in_code})}{N(\text{Tokens_in_code})}$$ + +4: +$$prompt \ coverage = \frac{N(\text{Tokens in prompt} \cap \text{Tokens in code})}{N(\text{Tokens in code})}$$ Prompt coverage is a metric we can observe in production, and it's one of several metrics we use when updating providers to ensure we haven't regressed the quality of the RAG. +
+ +
+ Flow of blah +
+
+ +more text... +