Commit b7d57cb

Refactored autocommit.js to import getModelContextSize from count_tokens.js and removed unused imports. Updated config.js to include modelName and increased maxTokens. Added count_tokens.js to calculate the number of tokens in a prompt. Updated package.json to include tiktoken as a dependency and bumped the version to 6.0.0.
shanginn committed Jun 17, 2023
1 parent cd646de commit b7d57cb
Showing 6 changed files with 129 additions and 37 deletions.
16 changes: 11 additions & 5 deletions autocommit.js
@@ -9,11 +9,8 @@ import {
     SystemMessagePromptTemplate
 } from "langchain/prompts";
 import defaultConfig from './config.js';
-import {RecursiveCharacterTextSplitter} from "langchain/text_splitter";
-import {loadSummarizationChain} from "langchain/chains";
 import {ChatOpenAI} from "langchain/chat_models/openai";
-import {OpenAI} from "langchain/llms/openai";
-import fs from "fs";
+import {getModelContextSize} from "./count_tokens.js";
 
 const config = rc(
     'git-aicommit',
@@ -88,8 +85,17 @@ const chatPrompt = ChatPromptTemplate.fromPromptMessages([
     humanPromptTemplate,
 ]);
 
-if (diff.length > 2000) {
+const chatMessages = await chatPrompt.formatMessages({
+    diff: diff,
+    language: config.language,
+});
+
+const tokenCount = (await openai.getNumTokensFromMessages(chatMessages)).totalCount
+const contextSize = getModelContextSize(config.modelName)
+
+if (tokenCount > contextSize) {
     console.log('Diff is too long. Splitting into multiple requests.')
     // TODO: split smarter
     const filenameRegex = /^a\/(.+?)\s+b\/(.+?)/;
     const diffByFiles = diff
         .split('diff ' + '--git ') // Weird string concat in order to avoid splitting on this line when using autocommit in this repo :)
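
The new gate above formats the chat prompt first, counts its tokens against the model, and only falls back to per-file splitting when the count exceeds the model's context window. A minimal sketch of that flow, assuming a placeholder diff string and model name; the per-file handling past the truncated hunk is an assumption, not the repo's exact code:

    import { getModelContextSize, calculateMaxTokens } from './count_tokens.js';

    // Placeholder diff text; autocommit.js gets this from `git diff`.
    const diff = 'diff --git a/foo.js b/foo.js\n...';
    const modelName = 'gpt-3.5-turbo-16k'; // assumed; config.modelName in the repo

    const contextSize = getModelContextSize(modelName); // 16384

    // Approximate gate: ~4 characters per token, the same fallback
    // count_tokens.js uses when tiktoken is unavailable.
    if (Math.ceil(diff.length / 4) > contextSize) {
        // Split on file boundaries, as the hunk above starts to do.
        const diffByFiles = diff.split('diff --git ').filter(Boolean);
        for (const fileDiff of diffByFiles) {
            const budget = await calculateMaxTokens({ prompt: fileDiff, modelName });
            console.log(`completion budget for this file: ${budget} tokens`);
        }
    }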
13 changes: 6 additions & 7 deletions config.js
@@ -15,15 +15,14 @@ export default {
     humanPromptTemplate: '' +
         'Read the following git diff for multiple files and ' +
         'write a 1-2 sentence commit message in {language} ' +
-        'without mentioning lines or files:\n' +
+        'without mentioning lines or files. ' +
+        'Explain why these changes were made (summarize the reasoning):\n' +
         '{diff}',
     excludeFromDiff: [
-        '*.lock', '*.lockb'
+        '*.lock', '*.lockb', '*-lock.json', '*-lock.yaml'
     ],
     diffFilter: 'ACMRTUXB',
-    completionPromptParams: {
-        model: "gpt-3.5-turbo-16k",
-        temperature: 0.0,
-        maxTokens: 1000,
-    }
+    modelName: "gpt-3.5-turbo-16k",
+    temperature: 0.0,
+    maxTokens: 2000,
 }
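
These are only defaults: autocommit.js loads them through rc, which layers any .git-aicommitrc it finds over this object. A minimal sketch of that merge; the override values are hypothetical:

    import rc from 'rc';
    import defaultConfig from './config.js';

    // rc('git-aicommit', defaults) merges config files such as a project's
    // .git-aicommitrc over the defaults, so a hypothetical
    //   { "modelName": "gpt-4", "maxTokens": 500 }
    // would win over the values above without editing config.js.
    const config = rc('git-aicommit', defaultConfig);
    console.log(config.modelName, config.maxTokens);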
78 changes: 78 additions & 0 deletions count_tokens.js
@@ -0,0 +1,78 @@
// langchain/dist/base_language/count_tokens.js
export const getModelNameForTiktoken = (modelName) => {
    if (modelName.startsWith("gpt-3.5-turbo-16k")) {
        return "gpt-3.5-turbo-16k";
    }
    if (modelName.startsWith("gpt-3.5-turbo-")) {
        return "gpt-3.5-turbo";
    }
    if (modelName.startsWith("gpt-4-32k-")) {
        return "gpt-4-32k";
    }
    if (modelName.startsWith("gpt-4-")) {
        return "gpt-4";
    }
    return modelName;
};
export const getEmbeddingContextSize = (modelName) => {
    switch (modelName) {
        case "text-embedding-ada-002":
            return 8191;
        default:
            return 2046;
    }
};
export const getModelContextSize = (modelName) => {
    switch (getModelNameForTiktoken(modelName)) {
        case "gpt-3.5-turbo-16k":
            return 16384;
        case "gpt-3.5-turbo":
            return 4096;
        case "gpt-4-32k":
            return 32768;
        case "gpt-4":
            return 8192;
        case "text-davinci-003":
            return 4097;
        case "text-curie-001":
            return 2048;
        case "text-babbage-001":
            return 2048;
        case "text-ada-001":
            return 2048;
        case "code-davinci-002":
            return 8000;
        case "code-cushman-001":
            return 2048;
        default:
            return 4097;
    }
};
export const importTiktoken = async () => {
    try {
        const { encoding_for_model } = await import("@dqbd/tiktoken");
        return { encoding_for_model };
    }
    catch (error) {
        console.log(error);
        return { encoding_for_model: null };
    }
};
export const calculateMaxTokens = async ({ prompt, modelName, }) => {
    const { encoding_for_model } = await importTiktoken();
    // fallback to approximate calculation if tiktoken is not available
    let numTokens = Math.ceil(prompt.length / 4);
    try {
        if (encoding_for_model) {
            const encoding = encoding_for_model(getModelNameForTiktoken(modelName));
            const tokenized = encoding.encode(prompt);
            numTokens = tokenized.length;
            encoding.free();
        }
    }
    catch (error) {
        console.warn("Failed to calculate number of tokens with tiktoken, falling back to approximate count", error);
    }
    const maxTokens = getModelContextSize(modelName);
    return maxTokens - numTokens;
};
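
A quick usage sketch of the two main exports; the prompt string is a placeholder:

    import { calculateMaxTokens, getModelContextSize } from './count_tokens.js';

    // Context window for the model configured in config.js.
    console.log(getModelContextSize('gpt-3.5-turbo-16k')); // 16384

    // Tokens left for the completion after this (placeholder) prompt;
    // falls back to ~prompt.length / 4 if tiktoken fails to load.
    const remaining = await calculateMaxTokens({
        prompt: 'Write a commit message for this diff: ...',
        modelName: 'gpt-3.5-turbo-16k',
    });
    console.log(remaining);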
13 changes: 9 additions & 4 deletions package-lock.json

Some generated files are not rendered by default.

8 changes: 4 additions & 4 deletions package.json
@@ -1,17 +1,17 @@
 {
   "name": "git-aicommit",
-  "version": "5.2.1",
+  "version": "6.0.0",
   "description": "Generates auto commit messages with OpenAI GPT3 model",
   "main": "autocommit.js",
   "repository": "https://github.com/shanginn/autocommit",
   "author": "[email protected]",
   "license": "MIT",
   "type": "module",
   "dependencies": {
-    "@dqbd/tiktoken": "^1.0.7",
     "langchain": "^0.0.75",
-    "openai": "^3.2.1",
-    "rc": "^1.2.8"
+    "openai": "^3.3.0",
+    "rc": "^1.2.8",
+    "tiktoken": "^1.0.8"
   },
   "preferGlobal": true,
   "bin": {
38 changes: 21 additions & 17 deletions pnpm-lock.yaml

Some generated files are not rendered by default.
