Skip to content

Commit

Permalink
feat: add new packages / improve json loader
Browse files Browse the repository at this point in the history
  • Loading branch information
zhzLuke96 committed Nov 24, 2024
1 parent 41ad928 commit 8b1e9b2
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 20 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
},
"dependencies": {
"@lenml/tokenizers": "^3.0.3",
"@quik-fe/stand": "^1.1.2",
"@quik-fe/stand": "^1.1.10",
"@types/draft-js": "^0.11.18",
"@types/styled-components": "^5.1.34",
"classnames": "^2.5.1",
Expand Down
14 changes: 7 additions & 7 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

41 changes: 34 additions & 7 deletions src/TokenizersHub.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ type PreTrainedTokenizer = tokenizers.PreTrainedTokenizer;
// NOTE: this package version is NOT the version of the `tokenizers` library; it is the version of the bundled tokenizer packages and does not necessarily match the `tokenizers` version.
const package_version = "3.0.1";
export const packages = [
"gemma2",
"qwen2_5",
"aya_expanse",
"llama3_2",
"mistral_nemo",
"gemini",
"llama3_1",
"llama2",
"llama3",
Expand All @@ -28,10 +34,12 @@ export const packages = [
"text_davinci002",
"text_davinci003",
"text_embedding_ada002",
].map((x) => ({
name: x,
url: `https://cdn.jsdelivr.net/npm/@lenml/tokenizer-${x}@${package_version}/+esm`,
}));
]
.sort()
.map((x) => ({
name: x,
url: `https://cdn.jsdelivr.net/npm/@lenml/tokenizer-${x}@${package_version}/+esm`,
}));

// Every tokenizer is obtained through this hub.
// A tokenizer can come from a package or from URLs.
Expand Down Expand Up @@ -87,13 +95,32 @@ export class TokenizersHub {
return this.pkg_registry[name];
}

/**
 * Downloads `url` and returns its body parsed as JSON.
 *
 * @param url - Address of the JSON document to download.
 * @returns The parsed JSON value, exactly as produced by `Response.json()`.
 * @throws Error when the response status is outside the 2xx range (the
 *   message carries the status line and, when readable, the response body),
 *   or when the body cannot be parsed as JSON.
 */
private async fetchJson(url: string) {
  const response = await fetch(url);
  // `ok` is true for the whole 2xx range; comparing against 200 alone would
  // reject other successful responses.
  if (!response.ok) {
    // Reading the body is best-effort: a failure here must not hide the
    // HTTP status, which is the actual cause being reported.
    const body = await response.text().catch(() => "");
    throw new Error(
      `fetch ${url} failed: ${response.status} ${response.statusText} \n${body}`.trim()
    );
  }
  try {
    return await response.json();
  } catch (error: unknown) {
    console.error(error);
    // Narrow instead of casting to `any`; fall back to the stringified value
    // for non-Error throwables or Errors with an empty message.
    const detail =
      error instanceof Error && error.message ? error.message : String(error);
    throw new Error(`parse data ${url} error: ${detail}`);
  }
}

private async loadFromUrl(
json_url: string,
config_url: string
): Promise<PreTrainedTokenizer> {
const tokenizer = await TokenizerLoader.fromPreTrainedUrls({
tokenizerJSON: json_url,
tokenizerConfig: config_url,
const tokenizer = await TokenizerLoader.fromPreTrained({
tokenizerJSON: await this.fetchJson(json_url),
tokenizerConfig: await this.fetchJson(config_url),
});

return tokenizer;
Expand Down
76 changes: 73 additions & 3 deletions src/components/TokenizerPanel/TokenizerConfigure.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,51 @@ const TokenizerFromUrls = ({ onConfigChange }: Props) => {
);
};

// Matches "<owner>/<name>" where each part is made of letters, digits, "_",
// "-" or ".". Dots must be accepted: ids such as "meta-llama/Llama-3.2-1B"
// (the placeholder shown to the user) contain them.
const HUGGINGFACE_REPO_PATTERN = /^[a-zA-Z0-9_.-]+\/[a-zA-Z0-9_.-]+$/;

/**
 * Form that loads a tokenizer from a Hugging Face repo id.
 *
 * The user types an id of the form `owner/name`; pressing "Load" validates it
 * and reports a `type: "url"` config through `onConfigChange`, pointing at the
 * repo's `tokenizer.json` and `tokenizer_config.json` on the `main` revision.
 * An invalid id triggers an alert and nothing is reported.
 */
const TokenizerFromHuggingfaceRepo = ({ onConfigChange }: Props) => {
  const [formData, setFormData] = useState({
    repo: "",
  });
  // Pasted ids frequently carry leading/trailing whitespace; ignore it.
  const repo = formData.repo.trim();

  const handleLoad = () => {
    // ".." is rejected explicitly so the id can never act as a relative path
    // segment inside the URLs built below.
    if (!HUGGINGFACE_REPO_PATTERN.test(repo) || repo.includes("..")) {
      alert("Invalid repo name");
      return;
    }
    onConfigChange({
      type: "url",
      json_url: `https://huggingface.co/${repo}/resolve/main/tokenizer.json?download=true`,
      config_url: `https://huggingface.co/${repo}/resolve/main/tokenizer_config.json?download=true`,
    });
  };

  return (
    <TokenizerFromUrlsBox>
      <div>
        <label>
          Repo Name:
          <input
            type="text"
            onChange={(e) => {
              const value = e.target.value ?? "";
              setFormData((prev) => ({
                ...prev,
                repo: value,
              }));
            }}
            placeholder="eg. meta-llama/Llama-3.2-1B"
          />
        </label>
      </div>
      <div>
        <button disabled={!repo} onClick={handleLoad}>
          Load
        </button>
      </div>
    </TokenizerFromUrlsBox>
  );
};

const TokenizerFromPackagesBox = styled.div`
display: flex;
Expand All @@ -89,8 +134,22 @@ const TokenizerFromPackagesBox = styled.div`
button {
padding: 4px;
font-size: 16px;
cursor: pointer;
background-color: transparent;
border: none;
border-radius: 4px;
&:hover {
background-color: rgba(255, 255, 255, 0.2);
}
&:active {
background-color: rgba(255, 255, 255, 0.4);
}
}
}
p {
opacity: 0.75;
}
`;

const TokenizerFromPackages = ({ onConfigChange }: Props) => {
Expand Down Expand Up @@ -140,8 +199,10 @@ const TokenizerConfigureBox = styled.div`
border-bottom: 1px solid transparent;
cursor: pointer;
opacity: 0.75;
&.--selected {
opacity: 1;
border-bottom-color: white;
color: white;
}
Expand All @@ -151,28 +212,37 @@ const TokenizerConfigureBox = styled.div`

/**
 * Tokenizer configuration picker.
 *
 * Renders three tabs (Hugging Face repo, URLs, packages) and, below them, the
 * form that belongs to the selected tab. `onConfigChange` is forwarded
 * unchanged to whichever form is shown. The "packages" tab is selected
 * initially.
 */
export const TokenizerConfigure = ({ onConfigChange }: Props) => {
  // Single source of truth for the active tab; only the three-mode
  // declaration is kept (a second, two-mode declaration would redeclare
  // `mode` and `setMode`).
  const [mode, setMode] = useState<"urls" | "packages" | "repo">("packages");

  return (
    <TokenizerConfigureBox>
      <div className="configure-tabs">
        <button
          className={classNames({ "--selected": mode === "repo" })}
          onClick={() => setMode("repo")}
        >
          🤗 From Huggingface Repo
        </button>
        <button
          className={classNames({ "--selected": mode === "urls" })}
          onClick={() => setMode("urls")}
        >
          🌐 From URLs
        </button>
        <button
          className={classNames({ "--selected": mode === "packages" })}
          onClick={() => setMode("packages")}
        >
          📦 From Packages
        </button>
      </div>
      {mode === "urls" && <TokenizerFromUrls onConfigChange={onConfigChange} />}
      {mode === "packages" && (
        <TokenizerFromPackages onConfigChange={onConfigChange} />
      )}
      {mode === "repo" && (
        <TokenizerFromHuggingfaceRepo onConfigChange={onConfigChange} />
      )}
    </TokenizerConfigureBox>
  );
};
26 changes: 24 additions & 2 deletions src/components/TokenizerPanel/TokenizerPanel.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -206,18 +206,24 @@ const useSelectionTokenRange = ({

/**
 * React hook that loads the tokenizer described by `config` through
 * `TokenizersHub.instance.get` and exposes the outcome.
 *
 * @param config - Tokenizer definition to load; the load is restarted whenever
 *   this value changes.
 * @returns `tokenizer` (null until a load has succeeded) and `error` (null
 *   unless the most recent load failed, otherwise the failure message).
 */
const useTokenizer = (config: TokenizerDefine) => {
  const [tokenizer, setTokenizer] = useState<null | PreTrainedTokenizer>(null);
  const [error, setError] = useState<null | string>(null);

  useEffect(() => {
    // Flipped by the cleanup below, so a load started for a previous `config`
    // (or for an unmounted component) cannot overwrite newer state.
    let cancelled = false;

    const loadTokenizer = async () => {
      setError(null);
      const loaded = await TokenizersHub.instance.get(config);
      if (cancelled) {
        return;
      }
      // NOTE: the tokenizer is a callable object. Passing it directly would
      // make React treat it as an updater function and call it, so it is
      // wrapped in an updater that returns it.
      setTokenizer(() => loaded);
    };

    // Started exactly once per effect run, always with a rejection handler.
    loadTokenizer().catch((err: unknown) => {
      console.error(err);
      if (cancelled) {
        return;
      }
      // The state is typed as a string, so non-Error throwables are
      // stringified rather than stored as-is.
      setError(err instanceof Error && err.message ? err.message : String(err));
    });

    return () => {
      cancelled = true;
    };
  }, [config]);

  return {
    tokenizer,
    error,
  };
};

Expand Down Expand Up @@ -351,7 +357,7 @@ export function TokenizerPanel({
inputValue?: string | null;
getInitValue?: () => string | undefined | null;
}) {
const { tokenizer } = useTokenizer(config);
const { tokenizer, error } = useTokenizer(config);

const [value, setValue] = useState(
() => getInitValue?.() ?? "Potato potato tomato potato."
Expand Down Expand Up @@ -391,6 +397,22 @@ export function TokenizerPanel({
onChange?.(value);
};

if (error) {
return (
<Body>
<pre
style={{
color: "red",
whiteSpace: "pre-wrap",
wordBreak: "break-all",
}}
>
{error}
</pre>
</Body>
);
}

if (!tokenizer) {
return (
<Body>
Expand Down

0 comments on commit 8b1e9b2

Please sign in to comment.