Commit ae32ce7 (1 parent: 200bf0c)
Showing 11 changed files with 258 additions and 6 deletions.
Submodule FBGEMM updated from afc808 to cbce9f
Submodule pytorch updated from aaad8e to 0ff651
Submodule gitignore updated (17 files):
  +11 −0   Ballerina.gitignore
  +1  −0   CMake.gitignore
  +12 −0   Delphi.gitignore
  +18 −0   GitHubPages.gitignore
  +2  −0   Global/Archives.gitignore
  +0  −4   Global/Xcode.gitignore
  +4  −0   Go.gitignore
  +47 −0   IAR.gitignore
  +0  −24  Objective-C.gitignore
  +3  −1   Python.gitignore
  +7  −0   Rust.gitignore
  +0  −28  Swift.gitignore
  +5  −0   TeX.gitignore
  +3  −0   Terraform.gitignore
  +2  −2   UnrealEngine.gitignore
  +44 −0   community/Alteryx.gitignore
  +11 −0   community/UiPath.gitignore
Submodule bitsandbytes updated from dada53 to 9e7537
Submodule torch-mlir updated (54 files)
Submodule Phind-CodeLlama-34B-v2 added at 949f61
Submodule subnet-llm updated (16 files)
@@ -0,0 +1,18 @@
Idea:

Take the grammar of a language, say the Guile Scheme or Mes core language.
Use only a fixed set of identifiers, say those in the core language, say Mes.
We can use the bootstrap code.
Profile the execution of the compilation and capture the functions.
Tokenize the language.
Generate statements from the grammar.
Generate invalid/valid statements (see the sketch after this note).
Look at the distribution of the encoding.

Reduce the token count to only the needed ones.
Autoencode the embedding and reduce its size.
Autoencode the first layer and reduce its size.

Finally, we can create a symbolic regression of the embedding and the first layer that will capture the core language.
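The note stops at the idea level; the sketch below shows one way the statement-generation step could look, under stated assumptions: a toy hand-written grammar, a made-up fixed identifier list, and a simple drop-or-swap corruption rule, none of which come from the actual Mes or Guile sources.

# A minimal sketch of the "generate valid/invalid statements from the grammar"
# step, assuming a toy subset of a Scheme-like grammar. The grammar rules,
# FIXED_IDENTIFIERS, and the corruption scheme are illustrative assumptions.
import random

TOY_GRAMMAR = {
    "expr": [["atom"], ["list"]],
    "list": [["(", "op", "expr", "expr", ")"]],
    "atom": [["identifier"], ["number"]],
}
FIXED_IDENTIFIERS = ["define", "lambda", "car", "cdr", "cons", "n"]  # assumed core set
OPERATORS = ["+", "-", "*", "="]


def generate(symbol="expr", depth=0, max_depth=4):
    """Expand a grammar symbol into a token list (a valid statement)."""
    if symbol == "identifier":
        return [random.choice(FIXED_IDENTIFIERS)]
    if symbol == "number":
        return [str(random.randint(0, 9))]
    if symbol == "op":
        return [random.choice(OPERATORS)]
    if symbol in ("(", ")"):
        return [symbol]
    rules = TOY_GRAMMAR[symbol]
    if depth >= max_depth and symbol == "expr":
        rules = [["atom"]]  # force termination near the depth limit
    rule = random.choice(rules)
    return [tok for part in rule for tok in generate(part, depth + 1, max_depth)]


def corrupt(tokens):
    """Perturb a statement (usually making it invalid) by dropping or swapping one token."""
    bad = list(tokens)
    if len(bad) > 1 and random.random() < 0.5:
        bad.pop(random.randrange(len(bad)))
    else:
        # Note: a swap can occasionally leave the string parsable; good enough for a sketch.
        bad[random.randrange(len(bad))] = random.choice(FIXED_IDENTIFIERS + OPERATORS)
    return bad


if __name__ == "__main__":
    for _ in range(3):
        valid = generate()
        print("valid:  ", " ".join(valid))
        print("invalid:", " ".join(corrupt(valid)))

Generated pairs like these could then feed the tokenizer and autoencoder in the file below.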
@@ -0,0 +1,198 @@
import re
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict


# Tokenization
def tokenize_mes(code):
    """Basic tokenization for MES: parentheses and whitespace-separated atoms."""
    tokens = re.findall(r'\(|\)|[^\s()]+', code)
    return tokens


def create_vocabulary(all_tokens):
    """Create a sorted vocabulary of unique tokens."""
    return sorted(set(all_tokens))


def tokens_to_indices(tokens, vocab):
    """Convert tokens to vocabulary indices."""
    return [vocab.index(token) for token in tokens]


class EmbeddingAutoencoder(nn.Module):
    """Autoencoder for embedding compression."""

    def __init__(self, vocab_size, embedding_dim, compressed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.Sequential(
            nn.Linear(embedding_dim, compressed_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(compressed_dim, embedding_dim),
            nn.ReLU()
        )

    def forward(self, input_x):
        """Embed token indices, compress them, then reconstruct the embedding."""
        embedded = self.embedding(input_x)
        encoded = self.encoder(embedded)
        decoded = self.decoder(encoded)
        return decoded


def process_mes_code(mes_code, embedding_dim=16,
                     compressed_dim=4, epochs=1000):
    """Tokenize MES code and train the embedding autoencoder on it."""
    # Tokenize
    tokens = tokenize_mes(mes_code)

    # Create vocabulary
    vocab = create_vocabulary(tokens)
    vocab_size = len(vocab)

    # Convert to indices
    indices = tokens_to_indices(tokens, vocab)

    # Prepare data for PyTorch
    data = torch.tensor(indices)
    dataset = TensorDataset(data)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

    # Initialize autoencoder
    model = EmbeddingAutoencoder(vocab_size, embedding_dim, compressed_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())

    # Train: the reconstruction target is the current embedding of the batch,
    # so the embedding and the autoencoder are trained jointly.
    for epoch in range(epochs):
        for batch in dataloader:
            inputs = batch[0]
            outputs = model(inputs)
            loss = criterion(outputs, model.embedding(inputs))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

    return model, vocab


def save_first_embedding(model, file_path):
    """Save the weights of the first embedding layer."""
    embedding_weights = model.embedding.weight.detach().cpu().numpy()
    np.save(file_path, embedding_weights)
    print(f"First embedding layer saved to {file_path}")


def load_first_embedding(file_path):
    """Load the saved weights of the first embedding layer."""
    return np.load(file_path)


def token_value_report(model, vocab):
    """Produce a report on the value of each token."""
    embedding_weights = model.embedding.weight.detach().cpu().numpy()

    # Calculate the L2 norm of each token's embedding
    token_norms = np.linalg.norm(embedding_weights, axis=1)

    # Calculate the cosine similarity between each pair of tokens
    similarity_matrix = embedding_weights @ embedding_weights.T
    norms = np.linalg.norm(embedding_weights, axis=1, keepdims=True)
    similarity_matrix /= norms
    similarity_matrix /= norms.T

    # Find the most similar tokens for each token
    most_similar = defaultdict(list)
    for i, token in enumerate(vocab):
        similarities = similarity_matrix[i]
        most_similar_indices = np.argsort(similarities)[-6:-1]
        # Top 5 similar tokens (excluding self)
        most_similar[token] = [(vocab[idx], similarities[idx]) for idx in
                               most_similar_indices[::-1]]

    # Create a DataFrame with the results
    df = pd.DataFrame({
        'Token': vocab,
        'Embedding Norm': token_norms,
        'Most Similar Tokens': [most_similar[token] for token in vocab]
    })

    # Sort by embedding norm (you could change this to sort by a different
    # metric if desired)
    df = df.sort_values('Embedding Norm', ascending=False).reset_index(
        drop=True)

    return df


def visualize_token_distribution(df):
    """Visualize the distribution of token embedding norms."""
    plt.figure(figsize=(10, 6))
    plt.hist(df['Embedding Norm'], bins=30)
    plt.title('Distribution of Token Embedding Norms')
    plt.xlabel('Embedding Norm')
    plt.ylabel('Frequency')
    plt.savefig('token_norm_distribution.png')
    plt.close()


def analyze_mes_embeddings(model, vocab, save_path='mes_first_embedding.npy'):
    """Save the embedding layer and report on the learned token embeddings."""
    # Save the first embedding layer
    save_first_embedding(model, save_path)

    # Produce token value report
    report = token_value_report(model, vocab)

    # Save report to CSV
    report.to_csv('token_value_report.csv', index=False)
    print("Token value report saved to token_value_report.csv")

    # Visualize token distribution
    visualize_token_distribution(report)
    print("Token norm distribution plot saved to token_norm_distribution.png")

    # Print top 10 tokens by embedding norm
    print("\nTop 10 tokens by embedding norm:")
    print(report[['Token', 'Embedding Norm']].head(10).to_string(index=False))

    # Print example of similar tokens
    print("\nExample of similar tokens:")
    example_token = report['Token'].iloc[0]
    print(f"Tokens most similar to '{example_token}':")
    for token, similarity in report['Most Similar Tokens'].iloc[0]:
        print(f"  {token}: {similarity:.4f}")


def main():
    """Driver: train on a small MES example and analyze the embeddings."""
    # Example MES code (this is a simplified example)
    mes_code = """
    (define (factorial n)
      (if (= n 0)
          1
          (* n (factorial (- n 1)))))
    """

    # Process the code
    mes_model, vocab = process_mes_code(mes_code)

    print("Vocabulary size:", len(vocab))
    print("Compressed embedding size:",
          mes_model.encoder[0].out_features)

    analyze_mes_embeddings(mes_model, vocab)


if __name__ == "__main__":
    main()
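The final step in the idea note, a symbolic regression of the embedding and the first layer, is not implemented in this file. A hedged sketch under stated assumptions (PySR as the symbolic-regression library, the mes_first_embedding.npy file written by save_first_embedding above, and a made-up function name symbolic_regress_first_layer) might look like this:

# Sketch only: fits a symbolic expression for one compressed dimension of the
# trained encoder as a function of the saved embedding coordinates.
# Assumes `pip install pysr`; any symbolic-regression library could stand in.
import numpy as np
import torch
from pysr import PySRRegressor


def symbolic_regress_first_layer(model, embedding_path="mes_first_embedding.npy",
                                 target_dim=0):
    """Approximate one output of the first encoder layer symbolically."""
    # Inputs: the (vocab_size, embedding_dim) matrix saved by save_first_embedding().
    X = np.load(embedding_path)

    # Targets: the chosen compressed dimension produced by the trained encoder.
    with torch.no_grad():
        y = model.encoder(torch.from_numpy(X).float()).numpy()[:, target_dim]

    regressor = PySRRegressor(
        niterations=40,                    # small search budget for a sketch
        binary_operators=["+", "-", "*"],
        unary_operators=["exp"],
    )
    regressor.fit(X, y)
    print(regressor)  # shows the discovered candidate expressions
    return regressor

For example, after running main() one could call symbolic_regress_first_layer(mes_model) on the trained model; repeating this for each compressed dimension would give a closed-form approximation of the first layer, which is the representation the idea note aims to capture.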