Commit ae32ce7 (1 parent: 200bf0c)
Showing 11 changed files with 258 additions and 6 deletions.
Submodule FBGEMM updated from afc808 to cbce9f
Submodule pytorch updated from aaad8e to 0ff651
Submodule gitignore updated (17 files):
  +11 −0   Ballerina.gitignore
  +1  −0   CMake.gitignore
  +12 −0   Delphi.gitignore
  +18 −0   GitHubPages.gitignore
  +2  −0   Global/Archives.gitignore
  +0  −4   Global/Xcode.gitignore
  +4  −0   Go.gitignore
  +47 −0   IAR.gitignore
  +0  −24  Objective-C.gitignore
  +3  −1   Python.gitignore
  +7  −0   Rust.gitignore
  +0  −28  Swift.gitignore
  +5  −0   TeX.gitignore
  +3  −0   Terraform.gitignore
  +2  −2   UnrealEngine.gitignore
  +44 −0   community/Alteryx.gitignore
  +11 −0   community/UiPath.gitignore
Submodule bitsandbytes updated from dada53 to 9e7537
Submodule torch-mlir updated (54 files)
Submodule Phind-CodeLlama-34B-v2 added at 949f61
Submodule subnet-llm updated (16 files)
@@ -0,0 +1,18 @@
Idea:

Take the grammar of a language, say the Guile Scheme or Mes core language.
Use only a fixed set of identifiers, say those in the core language, say Mes.
We can use the bootstrap code.
Profile the execution of the compilation and capture the functions.
Tokenize the language.
Generate statements from the grammar.
Generate invalid/valid statements (see the sketch after this note).
Look at the distribution of the encoding.

Reduce the token count to only the needed ones.
Autoencode the embedding and reduce its size.
Autoencode the first layer and reduce its size.

Finally, we can create a symbolic regression of the embedding and the first layer that will capture the core language.
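The note stops at the idea level; the sketch below shows one way the statement-generation step could look, under stated assumptions: a toy hand-written grammar, a made-up fixed identifier list, and a simple drop-or-swap corruption rule, none of which come from the actual Mes or Guile sources.

# A minimal sketch of the "generate valid/invalid statements from the grammar"
# step, assuming a toy subset of a Scheme-like grammar. The grammar rules,
# FIXED_IDENTIFIERS, and the corruption scheme are illustrative assumptions.
import random

TOY_GRAMMAR = {
    "expr": [["atom"], ["list"]],
    "list": [["(", "op", "expr", "expr", ")"]],
    "atom": [["identifier"], ["number"]],
}
FIXED_IDENTIFIERS = ["define", "lambda", "car", "cdr", "cons", "n"]  # assumed core set
OPERATORS = ["+", "-", "*", "="]


def generate(symbol="expr", depth=0, max_depth=4):
    """Expand a grammar symbol into a token list (a valid statement)."""
    if symbol == "identifier":
        return [random.choice(FIXED_IDENTIFIERS)]
    if symbol == "number":
        return [str(random.randint(0, 9))]
    if symbol == "op":
        return [random.choice(OPERATORS)]
    if symbol in ("(", ")"):
        return [symbol]
    rules = TOY_GRAMMAR[symbol]
    if depth >= max_depth and symbol == "expr":
        rules = [["atom"]]  # force termination near the depth limit
    rule = random.choice(rules)
    return [tok for part in rule for tok in generate(part, depth + 1, max_depth)]


def corrupt(tokens):
    """Perturb a statement (usually making it invalid) by dropping or swapping one token."""
    bad = list(tokens)
    if len(bad) > 1 and random.random() < 0.5:
        bad.pop(random.randrange(len(bad)))
    else:
        # Note: a swap can occasionally leave the string parsable; good enough for a sketch.
        bad[random.randrange(len(bad))] = random.choice(FIXED_IDENTIFIERS + OPERATORS)
    return bad


if __name__ == "__main__":
    for _ in range(3):
        valid = generate()
        print("valid:  ", " ".join(valid))
        print("invalid:", " ".join(corrupt(valid)))

Generated pairs like these could then feed the tokenizer and autoencoder in the file below.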
@@ -0,0 +1,198 @@
import re
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict


# Tokenization
def tokenize_mes(code):
    """Basic tokenization for MES: parentheses and whitespace-separated atoms."""
    tokens = re.findall(r'\(|\)|[^\s()]+', code)
    return tokens


def create_vocabulary(all_tokens):
    """Create a sorted vocabulary of unique tokens."""
    return sorted(set(all_tokens))


def tokens_to_indices(tokens, vocab):
    """Convert tokens to vocabulary indices."""
    return [vocab.index(token) for token in tokens]


class EmbeddingAutoencoder(nn.Module):
    """Autoencoder for embedding compression."""

    def __init__(self, vocab_size, embedding_dim, compressed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.Sequential(
            nn.Linear(embedding_dim, compressed_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(compressed_dim, embedding_dim),
            nn.ReLU()
        )

    def forward(self, input_x):
        """Embed token indices, compress them, then reconstruct the embedding."""
        embedded = self.embedding(input_x)
        encoded = self.encoder(embedded)
        decoded = self.decoder(encoded)
        return decoded


def process_mes_code(mes_code, embedding_dim=16,
                     compressed_dim=4, epochs=1000):
    """Tokenize MES code and train the embedding autoencoder on it."""
    # Tokenize
    tokens = tokenize_mes(mes_code)

    # Create vocabulary
    vocab = create_vocabulary(tokens)
    vocab_size = len(vocab)

    # Convert to indices
    indices = tokens_to_indices(tokens, vocab)

    # Prepare data for PyTorch
    data = torch.tensor(indices)
    dataset = TensorDataset(data)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

    # Initialize autoencoder
    model = EmbeddingAutoencoder(vocab_size, embedding_dim, compressed_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())

    # Train: the reconstruction target is the current embedding of the batch,
    # so the embedding and the autoencoder are trained jointly.
    for epoch in range(epochs):
        for batch in dataloader:
            inputs = batch[0]
            outputs = model(inputs)
            loss = criterion(outputs, model.embedding(inputs))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

    return model, vocab


def save_first_embedding(model, file_path):
    """Save the weights of the first embedding layer."""
    embedding_weights = model.embedding.weight.detach().cpu().numpy()
    np.save(file_path, embedding_weights)
    print(f"First embedding layer saved to {file_path}")


def load_first_embedding(file_path):
    """Load the saved weights of the first embedding layer."""
    return np.load(file_path)


def token_value_report(model, vocab):
    """Produce a report on the value of each token."""
    embedding_weights = model.embedding.weight.detach().cpu().numpy()

    # Calculate the L2 norm of each token's embedding
    token_norms = np.linalg.norm(embedding_weights, axis=1)

    # Calculate the cosine similarity between each pair of tokens
    similarity_matrix = embedding_weights @ embedding_weights.T
    norms = np.linalg.norm(embedding_weights, axis=1, keepdims=True)
    similarity_matrix /= norms
    similarity_matrix /= norms.T

    # Find the most similar tokens for each token
    most_similar = defaultdict(list)
    for i, token in enumerate(vocab):
        similarities = similarity_matrix[i]
        most_similar_indices = np.argsort(similarities)[-6:-1]
        # Top 5 similar tokens (excluding self)
        most_similar[token] = [(vocab[idx], similarities[idx]) for idx in
                               most_similar_indices[::-1]]

    # Create a DataFrame with the results
    df = pd.DataFrame({
        'Token': vocab,
        'Embedding Norm': token_norms,
        'Most Similar Tokens': [most_similar[token] for token in vocab]
    })

    # Sort by embedding norm (you could change this to sort by a different
    # metric if desired)
    df = df.sort_values('Embedding Norm', ascending=False).reset_index(
        drop=True)

    return df


def visualize_token_distribution(df):
    """Visualize the distribution of token embedding norms."""
    plt.figure(figsize=(10, 6))
    plt.hist(df['Embedding Norm'], bins=30)
    plt.title('Distribution of Token Embedding Norms')
    plt.xlabel('Embedding Norm')
    plt.ylabel('Frequency')
    plt.savefig('token_norm_distribution.png')
    plt.close()


def analyze_mes_embeddings(model, vocab, save_path='mes_first_embedding.npy'):
    """Save the embedding layer and report on the learned token embeddings."""
    # Save the first embedding layer
    save_first_embedding(model, save_path)

    # Produce token value report
    report = token_value_report(model, vocab)

    # Save report to CSV
    report.to_csv('token_value_report.csv', index=False)
    print("Token value report saved to token_value_report.csv")

    # Visualize token distribution
    visualize_token_distribution(report)
    print("Token norm distribution plot saved to token_norm_distribution.png")

    # Print top 10 tokens by embedding norm
    print("\nTop 10 tokens by embedding norm:")
    print(report[['Token', 'Embedding Norm']].head(10).to_string(index=False))

    # Print example of similar tokens
    print("\nExample of similar tokens:")
    example_token = report['Token'].iloc[0]
    print(f"Tokens most similar to '{example_token}':")
    for token, similarity in report['Most Similar Tokens'].iloc[0]:
        print(f"  {token}: {similarity:.4f}")


def main():
    """Driver: train on a small MES example and analyze the embeddings."""
    # Example MES code (this is a simplified example)
    mes_code = """
    (define (factorial n)
      (if (= n 0)
          1
          (* n (factorial (- n 1)))))
    """

    # Process the code
    mes_model, vocab = process_mes_code(mes_code)

    print("Vocabulary size:", len(vocab))
    print("Compressed embedding size:",
          mes_model.encoder[0].out_features)

    analyze_mes_embeddings(mes_model, vocab)


if __name__ == "__main__":
    main()
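The final step in the idea note, a symbolic regression of the embedding and the first layer, is not implemented in this file. A hedged sketch under stated assumptions (PySR as the symbolic-regression library, the mes_first_embedding.npy file written by save_first_embedding above, and a made-up function name symbolic_regress_first_layer) might look like this:

# Sketch only: fits a symbolic expression for one compressed dimension of the
# trained encoder as a function of the saved embedding coordinates.
# Assumes `pip install pysr`; any symbolic-regression library could stand in.
import numpy as np
import torch
from pysr import PySRRegressor


def symbolic_regress_first_layer(model, embedding_path="mes_first_embedding.npy",
                                 target_dim=0):
    """Approximate one output of the first encoder layer symbolically."""
    # Inputs: the (vocab_size, embedding_dim) matrix saved by save_first_embedding().
    X = np.load(embedding_path)

    # Targets: the chosen compressed dimension produced by the trained encoder.
    with torch.no_grad():
        y = model.encoder(torch.from_numpy(X).float()).numpy()[:, target_dim]

    regressor = PySRRegressor(
        niterations=40,                    # small search budget for a sketch
        binary_operators=["+", "-", "*"],
        unary_operators=["exp"],
    )
    regressor.fit(X, y)
    print(regressor)  # shows the discovered candidate expressions
    return regressor

For example, after running main() one could call symbolic_regress_first_layer(mes_model) on the trained model; repeating this for each compressed dimension would give a closed-form approximation of the first layer, which is the representation the idea note aims to capture.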