Implemented vggish

Also cleaned up some stuff in the other embedding scripts and RF script
CV4EcologySchool · Aug 30, 2023 · 39ad7a8 · 39ad7a8
1 parent 08e0d72
commit 39ad7a8
Show file tree

Hide file tree

Showing 16 changed files with 1,053 additions and 308 deletions.
diff --git a/code/simclr-pytorch-reefs/evaluation/embeddings/ImageNet_embedding_extractor.ipynb b/code/simclr-pytorch-reefs/evaluation/embeddings/ImageNet_embedding_extractor.ipynb
diff --git a/code/simclr-pytorch-reefs/evaluation/embeddings/ReefCLR_embedding_extractor.ipynb b/code/simclr-pytorch-reefs/evaluation/embeddings/ReefCLR_embedding_extractor.ipynb
diff --git a/code/simclr-pytorch-reefs/evaluation/embeddings/YAMNet_embedding_extractor.ipynb b/code/simclr-pytorch-reefs/evaluation/embeddings/YAMNet_embedding_extractor.ipynb
@@ -1,44 +1,111 @@
 {
  "cells": [
   {
-   "cell_type": "code",
-   "execution_count": 1,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# import torch\n",
-    "# import torch.nn as nn\n",
-    "# import torchvision.models as models\n",
-    "# from torch.utils.data import DataLoader"
+    "# VGGish\n",
+    "\n",
+    "Script to extract embeddings from audio using VGGish. \n",
+    "\n",
+    "Note this is far slower than the other embedding scripts as it's not using the GPU."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-08-30 21:28:37.542298: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+      "2023-08-30 21:28:42.233539: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
+     ]
+    }
+   ],
    "source": [
+    "import tensorflow as tf\n",
+    "import tensorflow_hub as hub\n",
+    "import numpy as np\n",
+    "import csv\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "from IPython.display import Audio\n",
+    "from scipy.io import wavfile\n",
+    "\n",
     "# Importing necessary modules\n",
     "import json\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-08-30 21:28:51.360074: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.\n",
+      "Skipping registering GPU devices...\n"
+     ]
+    }
+   ],
+   "source": [
+    "# load VGGish\n",
+    "model = hub.load('https://tfhub.dev/google/vggish/1')\n",
+    "\n",
+    "### needs this placeholder for some reason\n",
+    "# Input: 3 seconds of silence as mono 16 kHz waveform samples.\n",
+    "waveform = np.zeros(3 * 16000, dtype=np.float32)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# which dataset to use\n",
+    "test_dataset = 'test_bermuda'\n",
     "\n",
-    "# Load the JSON file\n",
+    "# path where json file of data is stored\n",
     "json_path = '/home/ben/reef-audio-representation-learning/data/dataset.json'\n",
-    "with open(json_path, 'r') as f:\n",
-    "    dataset_json = json.load(f)"
+    "\n",
+    "# path to the audio files\n",
+    "dataset_path = '/home/ben/data/full_dataset/'\n",
+    "\n",
+    "# path to the results folder, where the csv of embeddings will be saved\n",
+    "results_path = '/home/ben/reef-audio-representation-learning/code/simclr-pytorch-reefs/evaluation/embeddings/raw_embeddings/'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Find the right data"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
+    "# open the json\n",
+    "with open(json_path, 'r') as f:\n",
+    "    dataset_json = json.load(f)\n",
+    "    \n",
     "# Initialize an empty list to store the filtered entries\n",
     "filtered_entries = []\n",
     "\n",
     "# Filter entries based on 'data_type' and 'dataset'\n",
     "for entry in dataset_json['audio']:\n",
-    "    if entry['data_type'] == 'test_data' and entry['dataset'] == 'test_australia':\n",
+    "    if entry['data_type'] == 'test_data' and entry['dataset'] == test_dataset:\n",
     "        # Convert the 'class' to numeric\n",
     "        numeric_class = int(entry['class'].replace('class', ''))\n",
     "        \n",
@@ -49,15 +116,254 @@
     "        }\n",
     "        \n",
     "        # Append the filtered entry to the list\n",
-    "        filtered_entries.append(filtered_entry)"
+    "        filtered_entries.append(filtered_entry) # list object with dictionaries of {file_name: file, class}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Get embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def ensure_sample_rate(original_sample_rate, waveform, desired_sample_rate=16000):\n",
+    "  \"\"\"Return (desired_sample_rate, waveform); waveform is resampled only when the two rates differ.\"\"\"\n",
+    "  from scipy import signal  # local import: the notebook only imports scipy.io.wavfile, so the name `scipy` is unbound here\n",
+    "  if original_sample_rate != desired_sample_rate:\n",
+    "    desired_length = int(round(float(len(waveform)) /\n",
+    "                               original_sample_rate * desired_sample_rate))\n",
+    "    waveform = signal.resample(waveform, desired_length)\n",
+    "  return desired_sample_rate, waveform"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize an empty list to store the embeddings\n",
+    "all_embeddings = []\n",
+    "\n",
+    "# Rows (label + one column per feature) collected here, then turned into a DataFrame once\n",
+    "df_rows = []\n",
+    "\n",
+    "# Loop through each filtered entry to read and process the WAV file\n",
+    "for entry in filtered_entries:\n",
+    "    wav_file_name = dataset_path + entry['file_name']\n",
+    "    \n",
+    "    # Read the WAV file (path only: wavfile.read's second parameter is `mmap`, not a file mode)\n",
+    "    sample_rate, wav_data = wavfile.read(wav_file_name)\n",
+    "    \n",
+    "    # Resample to the 16 kHz default of ensure_sample_rate if the file uses another rate\n",
+    "    sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)\n",
+    "    \n",
+    "    # Pad wav_data with 280 extra zeros\n",
+    "    wav_data = np.pad(wav_data, (0, 280), 'constant')\n",
+    "    \n",
+    "    # Compute the embeddings\n",
+    "    embeddings = model(wav_data)\n",
+    "    \n",
+    "    # Assert the shape of the embeddings\n",
+    "    embeddings.shape.assert_is_compatible_with([None, 128])\n",
+    "\n",
+    "    # convert the first two embedding frames to numpy arrays\n",
+    "    second_1 = np.array(embeddings[0])\n",
+    "    second_2 = np.array(embeddings[1])\n",
+    "\n",
+    "    # element-wise mean of those two frames (original note: one frame per 1sec, so features are averaged over the 2 seconds)\n",
+    "    mean = np.mean([second_1, second_2], axis=0)\n",
+    "    \n",
+    "    # Create a row for DataFrame\n",
+    "    df_row = {'label': entry['class']}\n",
+    "    for i, feature in enumerate(mean):  # mean holds the averaged feature vector (128 values per the shape check above)\n",
+    "        df_row[f'Feature_{i+1}'] = feature\n",
+    "    \n",
+    "    df_rows.append(df_row)\n",
+    "\n",
+    "# Create a DataFrame\n",
+    "df = pd.DataFrame(df_rows)\n",
+    "\n",
+    "# Save the DataFrame to a CSV file (test_dataset[5:] drops the first five characters, i.e. the 'test_' prefix)\n",
+    "df.to_csv(results_path + 'VGGish-' + test_dataset[5:] + '-embeddings.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>label</th>\n",
+       "      <th>Feature_1</th>\n",
+       "      <th>Feature_2</th>\n",
+       "      <th>Feature_3</th>\n",
+       "      <th>Feature_4</th>\n",
+       "      <th>Feature_5</th>\n",
+       "      <th>Feature_6</th>\n",
+       "      <th>Feature_7</th>\n",
+       "      <th>Feature_8</th>\n",
+       "      <th>Feature_9</th>\n",
+       "      <th>...</th>\n",
+       "      <th>Feature_119</th>\n",
+       "      <th>Feature_120</th>\n",
+       "      <th>Feature_121</th>\n",
+       "      <th>Feature_122</th>\n",
+       "      <th>Feature_123</th>\n",
+       "      <th>Feature_124</th>\n",
+       "      <th>Feature_125</th>\n",
+       "      <th>Feature_126</th>\n",
+       "      <th>Feature_127</th>\n",
+       "      <th>Feature_128</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>-0.755880</td>\n",
+       "      <td>-0.239144</td>\n",
+       "      <td>-0.006482</td>\n",
+       "      <td>-0.660316</td>\n",
+       "      <td>-0.661326</td>\n",
+       "      <td>-1.564038</td>\n",
+       "      <td>0.189483</td>\n",
+       "      <td>-0.150790</td>\n",
+       "      <td>-2.337072</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-0.306457</td>\n",
+       "      <td>0.085061</td>\n",
+       "      <td>-0.065240</td>\n",
+       "      <td>-0.174579</td>\n",
+       "      <td>-0.748717</td>\n",
+       "      <td>-0.202958</td>\n",
+       "      <td>-0.170341</td>\n",
+       "      <td>-0.619031</td>\n",
+       "      <td>0.144040</td>\n",
+       "      <td>0.159795</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>-0.569910</td>\n",
+       "      <td>-0.196253</td>\n",
+       "      <td>-0.012757</td>\n",
+       "      <td>-0.733111</td>\n",
+       "      <td>-0.702112</td>\n",
+       "      <td>-1.603721</td>\n",
+       "      <td>0.293776</td>\n",
+       "      <td>-0.188705</td>\n",
+       "      <td>-2.214564</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-0.384656</td>\n",
+       "      <td>0.058515</td>\n",
+       "      <td>-0.087278</td>\n",
+       "      <td>-0.202737</td>\n",
+       "      <td>-0.680734</td>\n",
+       "      <td>-0.189267</td>\n",
+       "      <td>-0.165939</td>\n",
+       "      <td>-0.563902</td>\n",
+       "      <td>0.084017</td>\n",
+       "      <td>0.065772</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "      <td>-0.767339</td>\n",
+       "      <td>-0.215024</td>\n",
+       "      <td>0.117208</td>\n",
+       "      <td>-0.570487</td>\n",
+       "      <td>-0.628667</td>\n",
+       "      <td>-1.538399</td>\n",
+       "      <td>0.244541</td>\n",
+       "      <td>-0.060223</td>\n",
+       "      <td>-2.132523</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-0.199617</td>\n",
+       "      <td>0.119985</td>\n",
+       "      <td>-0.073416</td>\n",
+       "      <td>-0.218369</td>\n",
+       "      <td>-0.632460</td>\n",
+       "      <td>-0.165810</td>\n",
+       "      <td>-0.144961</td>\n",
+       "      <td>-0.630340</td>\n",
+       "      <td>0.159019</td>\n",
+       "      <td>0.107950</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3 rows × 129 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   label  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \\\n",
+       "0      0  -0.755880  -0.239144  -0.006482  -0.660316  -0.661326  -1.564038   \n",
+       "1      0  -0.569910  -0.196253  -0.012757  -0.733111  -0.702112  -1.603721   \n",
+       "2      0  -0.767339  -0.215024   0.117208  -0.570487  -0.628667  -1.538399   \n",
+       "\n",
+       "   Feature_7  Feature_8  Feature_9  ...  Feature_119  Feature_120  \\\n",
+       "0   0.189483  -0.150790  -2.337072  ...    -0.306457     0.085061   \n",
+       "1   0.293776  -0.188705  -2.214564  ...    -0.384656     0.058515   \n",
+       "2   0.244541  -0.060223  -2.132523  ...    -0.199617     0.119985   \n",
+       "\n",
+       "   Feature_121  Feature_122  Feature_123  Feature_124  Feature_125  \\\n",
+       "0    -0.065240    -0.174579    -0.748717    -0.202958    -0.170341   \n",
+       "1    -0.087278    -0.202737    -0.680734    -0.189267    -0.165939   \n",
+       "2    -0.073416    -0.218369    -0.632460    -0.165810    -0.144961   \n",
+       "\n",
+       "   Feature_126  Feature_127  Feature_128  \n",
+       "0    -0.619031     0.144040     0.159795  \n",
+       "1    -0.563902     0.084017     0.065772  \n",
+       "2    -0.630340     0.159019     0.107950  \n",
+       "\n",
+       "[3 rows x 129 columns]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# view first 5 entries to check it worked\n",
+    "df.head()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# get a summary of the label column in df\n",
+    "df['label'].describe()"
+   ]
   }
  ],
  "metadata": {