Commit b7d57cb

Refactored autocommit.js to import getModelContextSize from count_tokens.js and removed unused imports. Updated config.js to include modelName and increased maxTokens. Added count_tokens.js to calculate the number of tokens in a prompt. Updated package.json to include tiktoken as a dependency and bumped the version to 6.0.0.
shanginn committed Jun 17, 2023
1 parent cd646de commit b7d57cb
Showing 6 changed files with 129 additions and 37 deletions.
16 changes: 11 additions & 5 deletions autocommit.js
@@ -9,11 +9,8 @@ import {
     SystemMessagePromptTemplate
 } from "langchain/prompts";
 import defaultConfig from './config.js';
-import {RecursiveCharacterTextSplitter} from "langchain/text_splitter";
-import {loadSummarizationChain} from "langchain/chains";
 import {ChatOpenAI} from "langchain/chat_models/openai";
-import {OpenAI} from "langchain/llms/openai";
-import fs from "fs";
+import {getModelContextSize} from "./count_tokens.js";
 
 const config = rc(
     'git-aicommit',
@@ -88,8 +85,17 @@ const chatPrompt = ChatPromptTemplate.fromPromptMessages([
     humanPromptTemplate,
 ]);
 
-if (diff.length > 2000) {
+const chatMessages = await chatPrompt.formatMessages({
+    diff: diff,
+    language: config.language,
+});
+
+const tokenCount = (await openai.getNumTokensFromMessages(chatMessages)).totalCount
+const contextSize = getModelContextSize(config.modelName)
+
+if (tokenCount > contextSize) {
     console.log('Diff is too long. Splitting into multiple requests.')
     // TODO: split smarter
     const filenameRegex = /^a\/(.+?)\s+b\/(.+?)/;
     const diffByFiles = diff
         .split('diff ' + '--git ') // Weird string concat in order to avoid splitting on this line when using autocommit in this repo :)
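
The new gate above formats the chat prompt first, counts its tokens against the model, and only falls back to per-file splitting when the count exceeds the model's context window. A minimal sketch of that flow, assuming a placeholder diff string and model name; the per-file handling past the truncated hunk is an assumption, not the repo's exact code:

    import { getModelContextSize, calculateMaxTokens } from './count_tokens.js';

    // Placeholder diff text; autocommit.js gets this from `git diff`.
    const diff = 'diff --git a/foo.js b/foo.js\n...';
    const modelName = 'gpt-3.5-turbo-16k'; // assumed; config.modelName in the repo

    const contextSize = getModelContextSize(modelName); // 16384

    // Approximate gate: ~4 characters per token, the same fallback
    // count_tokens.js uses when tiktoken is unavailable.
    if (Math.ceil(diff.length / 4) > contextSize) {
        // Split on file boundaries, as the hunk above starts to do.
        const diffByFiles = diff.split('diff --git ').filter(Boolean);
        for (const fileDiff of diffByFiles) {
            const budget = await calculateMaxTokens({ prompt: fileDiff, modelName });
            console.log(`completion budget for this file: ${budget} tokens`);
        }
    }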
13 changes: 6 additions & 7 deletions config.js
@@ -15,15 +15,14 @@ export default {
     humanPromptTemplate: '' +
         'Read the following git diff for multiple files and ' +
         'write a 1-2 sentence commit message in {language} ' +
-        'without mentioning lines or files:\n' +
+        'without mentioning lines or files. ' +
+        'Explain why these changes were made (summarize the reasoning):\n' +
         '{diff}',
     excludeFromDiff: [
-        '*.lock', '*.lockb'
+        '*.lock', '*.lockb', '*-lock.json', '*-lock.yaml'
     ],
     diffFilter: 'ACMRTUXB',
-    completionPromptParams: {
-        model: "gpt-3.5-turbo-16k",
-        temperature: 0.0,
-        maxTokens: 1000,
-    }
+    modelName: "gpt-3.5-turbo-16k",
+    temperature: 0.0,
+    maxTokens: 2000,
 }
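
These are only defaults: autocommit.js loads them through rc, which layers any .git-aicommitrc it finds over this object. A minimal sketch of that merge; the override values are hypothetical:

    import rc from 'rc';
    import defaultConfig from './config.js';

    // rc('git-aicommit', defaults) merges config files such as a project's
    // .git-aicommitrc over the defaults, so a hypothetical
    //   { "modelName": "gpt-4", "maxTokens": 500 }
    // would win over the values above without editing config.js.
    const config = rc('git-aicommit', defaultConfig);
    console.log(config.modelName, config.maxTokens);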
78 changes: 78 additions & 0 deletions count_tokens.js
@@ -0,0 +1,78 @@
// langchain/dist/base_language/count_tokens.js
export const getModelNameForTiktoken = (modelName) => {
    if (modelName.startsWith("gpt-3.5-turbo-16k")) {
        return "gpt-3.5-turbo-16k";
    }
    if (modelName.startsWith("gpt-3.5-turbo-")) {
        return "gpt-3.5-turbo";
    }
    if (modelName.startsWith("gpt-4-32k-")) {
        return "gpt-4-32k";
    }
    if (modelName.startsWith("gpt-4-")) {
        return "gpt-4";
    }
    return modelName;
};
export const getEmbeddingContextSize = (modelName) => {
    switch (modelName) {
        case "text-embedding-ada-002":
            return 8191;
        default:
            return 2046;
    }
};
export const getModelContextSize = (modelName) => {
    switch (getModelNameForTiktoken(modelName)) {
        case "gpt-3.5-turbo-16k":
            return 16384;
        case "gpt-3.5-turbo":
            return 4096;
        case "gpt-4-32k":
            return 32768;
        case "gpt-4":
            return 8192;
        case "text-davinci-003":
            return 4097;
        case "text-curie-001":
            return 2048;
        case "text-babbage-001":
            return 2048;
        case "text-ada-001":
            return 2048;
        case "code-davinci-002":
            return 8000;
        case "code-cushman-001":
            return 2048;
        default:
            return 4097;
    }
};
export const importTiktoken = async () => {
    try {
        const { encoding_for_model } = await import("@dqbd/tiktoken");
        return { encoding_for_model };
    }
    catch (error) {
        console.log(error);
        return { encoding_for_model: null };
    }
};
export const calculateMaxTokens = async ({ prompt, modelName, }) => {
    const { encoding_for_model } = await importTiktoken();
    // fallback to approximate calculation if tiktoken is not available
    let numTokens = Math.ceil(prompt.length / 4);
    try {
        if (encoding_for_model) {
            const encoding = encoding_for_model(getModelNameForTiktoken(modelName));
            const tokenized = encoding.encode(prompt);
            numTokens = tokenized.length;
            encoding.free();
        }
    }
    catch (error) {
        console.warn("Failed to calculate number of tokens with tiktoken, falling back to approximate count", error);
    }
    const maxTokens = getModelContextSize(modelName);
    return maxTokens - numTokens;
};
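
A quick usage sketch of the two main exports; the prompt string is a placeholder:

    import { calculateMaxTokens, getModelContextSize } from './count_tokens.js';

    // Context window for the model configured in config.js.
    console.log(getModelContextSize('gpt-3.5-turbo-16k')); // 16384

    // Tokens left for the completion after this (placeholder) prompt;
    // falls back to ~prompt.length / 4 if tiktoken fails to load.
    const remaining = await calculateMaxTokens({
        prompt: 'Write a commit message for this diff: ...',
        modelName: 'gpt-3.5-turbo-16k',
    });
    console.log(remaining);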
13 changes: 9 additions & 4 deletions package-lock.json

Some generated files are not rendered by default.

8 changes: 4 additions & 4 deletions package.json
@@ -1,17 +1,17 @@
 {
   "name": "git-aicommit",
-  "version": "5.2.1",
+  "version": "6.0.0",
   "description": "Generates auto commit messages with OpenAI GPT3 model",
   "main": "autocommit.js",
   "repository": "https://github.com/shanginn/autocommit",
   "author": "[email protected]",
   "license": "MIT",
   "type": "module",
   "dependencies": {
-    "@dqbd/tiktoken": "^1.0.7",
     "langchain": "^0.0.75",
-    "openai": "^3.2.1",
-    "rc": "^1.2.8"
+    "openai": "^3.3.0",
+    "rc": "^1.2.8",
+    "tiktoken": "^1.0.8"
   },
   "preferGlobal": true,
   "bin": {
38 changes: 21 additions & 17 deletions pnpm-lock.yaml

Some generated files are not rendered by default.
