Skip to content

Commit

Permalink
Fix formula, add flow diagram
Browse files Browse the repository at this point in the history
  • Loading branch information
arturl committed Dec 27, 2024
1 parent 1df9502 commit 5268b2c
Show file tree
Hide file tree
Showing 3 changed files with 293 additions and 22 deletions.
Binary file added content/blog/codegen-learnings/flow.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
267 changes: 267 additions & 0 deletions content/blog/codegen-learnings/flow.tldr
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
{
"tldrawFileFormatVersion": 1,
"schema": {
"schemaVersion": 2,
"sequences": {
"com.tldraw.store": 4,
"com.tldraw.asset": 1,
"com.tldraw.camera": 1,
"com.tldraw.document": 2,
"com.tldraw.instance": 25,
"com.tldraw.instance_page_state": 5,
"com.tldraw.page": 1,
"com.tldraw.instance_presence": 6,
"com.tldraw.pointer": 1,
"com.tldraw.shape": 4,
"com.tldraw.asset.bookmark": 2,
"com.tldraw.asset.image": 5,
"com.tldraw.asset.video": 5,
"com.tldraw.shape.group": 0,
"com.tldraw.shape.text": 2,
"com.tldraw.shape.bookmark": 2,
"com.tldraw.shape.draw": 2,
"com.tldraw.shape.geo": 9,
"com.tldraw.shape.note": 8,
"com.tldraw.shape.line": 5,
"com.tldraw.shape.frame": 0,
"com.tldraw.shape.arrow": 5,
"com.tldraw.shape.highlight": 1,
"com.tldraw.shape.embed": 4,
"com.tldraw.shape.image": 4,
"com.tldraw.shape.video": 2,
"com.tldraw.binding.arrow": 0
}
},
"records": [
{
"gridSize": 10,
"name": "",
"meta": {},
"id": "document:document",
"typeName": "document"
},
{
"meta": {},
"id": "page:page",
"name": "Page 1",
"index": "a1",
"typeName": "page"
},
{
"id": "pointer:pointer",
"typeName": "pointer",
"x": 357.43359375,
"y": 132.7734375,
"lastActivityTimestamp": 1735267617634,
"meta": {}
},
{
"followingUserId": null,
"opacityForNextShape": 1,
"stylesForNextShape": {
"tldraw:size": "s",
"tldraw:font": "sans",
"tldraw:geo": "arrow-right"
},
"brush": null,
"scribbles": [],
"cursor": {
"type": "cross",
"rotation": 0
},
"isFocusMode": false,
"exportBackground": true,
"isDebugMode": false,
"isToolLocked": false,
"screenBounds": {
"x": 0,
"y": 0,
"w": 1128,
"h": 978
},
"insets": [
false,
false,
true,
false
],
"zoomBrush": null,
"isGridMode": false,
"isPenMode": false,
"chatMessage": "",
"isChatting": false,
"highlightedUserIds": [],
"isFocused": true,
"devicePixelRatio": 2,
"isCoarsePointer": false,
"isHoveringCanvas": true,
"openMenus": [],
"isChangingStyle": false,
"isReadonly": false,
"meta": {},
"duplicateProps": null,
"id": "instance:instance",
"currentPageId": "page:page",
"typeName": "instance"
},
{
"editingShapeId": null,
"croppingShapeId": null,
"selectedShapeIds": [
"shape:0vPmbhCIhdgPr7gYnOxm9"
],
"hoveredShapeId": "shape:0vPmbhCIhdgPr7gYnOxm9",
"erasingShapeIds": [],
"hintingShapeIds": [],
"focusedGroupId": null,
"meta": {},
"id": "instance_page_state:page:page",
"pageId": "page:page",
"typeName": "instance_page_state"
},
{
"x": 49.0078125,
"y": 97.7109375,
"z": 1,
"meta": {},
"id": "camera:page:page",
"typeName": "camera"
},
{
"x": 40.28125,
"y": 79.8203125,
"rotation": 0,
"isLocked": false,
"opacity": 1,
"meta": {},
"id": "shape:BJfJwAyLKVoUOGaixU2Xf",
"type": "text",
"props": {
"color": "black",
"size": "s",
"w": 252.359375,
"text": "User query:\n\n\"Generate code for S3 Bucket\"",
"font": "sans",
"textAlign": "start",
"autoSize": true,
"scale": 1
},
"parentId": "page:page",
"index": "a1",
"typeName": "shape"
},
{
"x": 374.58203125,
"y": 54.3125,
"rotation": 0,
"isLocked": false,
"opacity": 1,
"meta": {},
"id": "shape:z2gtvIBdf6_eCX2gzQ2L1",
"type": "text",
"props": {
"color": "black",
"size": "s",
"w": 310.2265625,
"text": "Search terms:\n\n\"AWS S3 bucket\",\n\"Pulumi AWS S3\",\n\"create S3 bucket Pulumi TypeScript\"",
"font": "sans",
"textAlign": "start",
"autoSize": true,
"scale": 1
},
"parentId": "page:page",
"index": "a22v7",
"typeName": "shape"
},
{
"x": 25.6796875,
"y": 58.140625,
"rotation": 0,
"isLocked": false,
"opacity": 1,
"meta": {},
"id": "shape:PGJKCoOonmat14xrifIp0",
"type": "geo",
"props": {
"w": 286.8671875,
"h": 132.9296875,
"geo": "rectangle",
"color": "black",
"labelColor": "black",
"fill": "none",
"dash": "draw",
"size": "s",
"font": "sans",
"text": "",
"align": "middle",
"verticalAlign": "middle",
"growY": 0,
"url": "",
"scale": 1
},
"parentId": "page:page",
"index": "a39Cr",
"typeName": "shape"
},
{
"x": 362.58203125,
"y": 35.85546875000003,
"rotation": 0,
"isLocked": false,
"opacity": 1,
"meta": {},
"id": "shape:hrkCX2QL0NyXDz_xl3n_i",
"type": "geo",
"props": {
"w": 342.9609375,
"h": 172.41015624999997,
"geo": "rectangle",
"color": "black",
"labelColor": "black",
"fill": "none",
"dash": "draw",
"size": "s",
"font": "sans",
"text": "",
"align": "middle",
"verticalAlign": "middle",
"growY": 0,
"url": "",
"scale": 1
},
"parentId": "page:page",
"index": "a40nT",
"typeName": "shape"
},
{
"x": 322.2734375,
"y": 100.125,
"rotation": 0,
"isLocked": false,
"opacity": 1,
"meta": {},
"id": "shape:0vPmbhCIhdgPr7gYnOxm9",
"type": "geo",
"props": {
"w": 35.16015625,
"h": 32.6484375,
"geo": "arrow-right",
"color": "black",
"labelColor": "black",
"fill": "none",
"dash": "draw",
"size": "s",
"font": "sans",
"text": "",
"align": "middle",
"verticalAlign": "middle",
"growY": 0,
"url": "",
"scale": 1
},
"parentId": "page:page",
"index": "a55Ml",
"typeName": "shape"
}
]
}
48 changes: 26 additions & 22 deletions content/blog/codegen-learnings/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,32 +82,16 @@ Because you were looking for the word "pie", you also retrieved a recipe for a S

Now let's formalize this a bit. Recall measures the ratio of the relevant documents retrieved to the total number of relevant documents in RAG:

TODO

old:

$$Recall = \frac{N(Retrieved\_documents \cap Relevant\_documents)}{N(Relevant\_documents)}$$

fixed1:

$$Recall = \frac{N(\text{Retrieved\_documents} \cap \text{Relevant\_documents})}{N(\text{Relevant\_documents})}$$

fixed2:

$$Recall = \frac{N(Retrieved\text{\_}documents \cap Relevant\text{\_}documents)}{N(Relevant\text{\_}documents)}$$

alternative

$$Recall = \frac{N(\text{RetrievedDocuments} \cap \text{RelevantDocuments})}{N(\text{RelevantDocuments})}$$
$$Recall = \frac{N(Retrieved \cap Relevant)}{N(Relevant)}$$

Where

- $N(Retrieved\_documents \cap Relevant\_documents)$ is the number of documents that are both retrieved and relevant.
- $N(Relevant\_documents)$ is the total number of relevant documents in the database.
- $N(Retrieved \cap Relevant)$ is the number of documents that are both retrieved and relevant.
- $N(Relevant)$ is the total number of relevant documents in the database.

Good recall means that many documents relevant to the query were retrieved.

$$Precision = \frac{N(Retrieved\_documents \cap Relevant\_documents)}{N(Retrieved\_documents)}$$
$$Precision = \frac{N(Retrieved \cap Relevant)}{N(Retrieved)}$$

Where $N(Retrieved)$ is the total number of documents that were retrieved.

Expand All @@ -117,17 +101,37 @@ Naturally, an effective RAG maximizes both the recall and the precision. It's [b

### Practical concerns

Precision and recall are essential in understanding the information retrieval quality, they are quite hard to measure in practice. Unlike a cookbook, Pulumi registry contains thousands of ever changing documents, and evaluating how many of them are relevant for every user-submitted query is impractical making recall evaluation for live traffic next to impossible. Things a little easier with precision, where we're dealing with a small number of documents, but even that metric requires a non-trivial evaluation of relevance, which requires an LLM call or a human judge where the number of documents is small.
Precision and recall are essential in understanding the information retrieval quality, but they are quite hard to measure in practice. Unlike a cookbook, the Pulumi registry contains thousands of ever-changing documents, and evaluating how many of them are relevant for every user-submitted query is impractical, making recall evaluation for live traffic next to impossible. Things are a little easier with precision, where we're dealing with a small number of documents, but even that metric requires a non-trivial evaluation of relevance, which requires an LLM call or a human judge where the number of documents is small.

Fortunately, other metrics that often can effectively estimate retrieval quality have been developed. We have found a metric that can predict, with some degree of accuracy, whether the generated code will successfully compile. For this metric, we compare the _tokens_ present in the prompt produced by the LLM with the tokens present in the actually generated code. (By token here we mean a compiler token - an identifier such as the name of a class, method or a field - and not the traditional LLM token concept.)
Intuitively, if a token present in the prompt also appears in the generated program, we can assume that the token effectively contributed to the generated program. Tokens in the generated program that were not part of the prompt are not necessarily wrong but they are less trusted (they can come from the LLM built-in knowledge or were, ahem, hallucinated).

$$prompt \ coverage = \frac{N(Tokens\_in\_prompt \cap Tokens\_in\_code)}{N(Tokens\_in\_code)} $$
1:
$$prompt \ coverage = \frac{N(Tokens\_in\_prompt \cap Tokens\_in\_code)}{N(Tokens\_in\_code)}$$

2:
$$prompt \ coverage = \frac{N(\text{Tokens\_in\_prompt} \cap \text{Tokens\_in\_code})}{N(\text{Tokens\_in\_code})}$$

3:

$$prompt \ coverage = \frac{N(\text{Tokens_in_prompt} \cap \text{Tokens_in_code})}{N(\text{Tokens_in_code})}$$

4:
$$prompt \ coverage = \frac{N(\text{Tokens in prompt} \cap \text{Tokens in code})}{N(\text{Tokens in code})}$$

<!-- Note: our documents call it Recall, which is not how industry uses this term (see above) -->

Prompt coverage is a metric we can observe in production, and it's one of several metrics we use when updating providers to ensure we haven't regressed the quality of the RAG.

<div style="text-align: center; width: 50%; margin: 0 auto;">
<img src="flow.png" alt="" style="width: 100%;">
<figcaption>
<i>Flow of blah</i>
</figcaption>
</div>

more text...

<!--raw material
1.1. "generate code for S3 bucket" -> get search terms:
Expand Down

0 comments on commit 5268b2c

Please sign in to comment.