Skip to content

Commit

Permalink
Convert token frequency counter type to long for huge data sets
Browse files: browse the repository at this point in the history
  • Loading branch information
zhongkaifu committed Oct 20, 2023
1 parent 3e99859 commit 87b3ad6
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 16 deletions.
10 changes: 5 additions & 5 deletions Seq2SeqSharp/Corpus/CorpusBatch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,8 @@ public List<List<string>> InitializeHypTokens(string prefix)


// count up all words
public static List<Dictionary<string, int>> s_ds = new List<Dictionary<string, int>>();
public static List<Dictionary<string, int>> t_ds = new List<Dictionary<string, int>>();
public static List<Dictionary<string, long>> s_ds = new List<Dictionary<string, long>>();
public static List<Dictionary<string, long>> t_ds = new List<Dictionary<string, long>>();



Expand All @@ -156,7 +156,7 @@ static public void MergeTokensCountSrcTgt(int srcGroupIdx, int tgtGroupIdx)
static public void ReduceSrcTokensToSingleGroup()
{
Logger.WriteLine($"Reduce source vocabs group from '{s_ds.Count}' to 1");
Dictionary<string, int> rst = new Dictionary<string, int>();
Dictionary<string, long> rst = new Dictionary<string, long>();

foreach (var dict in s_ds)
{
Expand Down Expand Up @@ -206,14 +206,14 @@ static public (List<Vocab>, List<Vocab>) GenerateVocabs(int srcVocabSize = 45000
return (srcVocabs, tgtVocabs);
}

private static List<Vocab> InnerBuildVocab(int vocabSize, List<Dictionary<string, int>> ds, string tag, int minFreq = 1)
private static List<Vocab> InnerBuildVocab(int vocabSize, List<Dictionary<string, long>> ds, string tag, int minFreq = 1)
{
List<Vocab> vocabs = new List<Vocab>();

for (int i = 0; i < ds.Count; i++)
{
Vocab vocab = new Vocab();
SortedDictionary<int, List<string>> sd = new SortedDictionary<int, List<string>>();
SortedDictionary<long, List<string>> sd = new SortedDictionary<long, List<string>>();

var s_d = ds[i];
foreach (var kv in s_d)
Expand Down
6 changes: 3 additions & 3 deletions Seq2SeqSharp/Corpus/MonoCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ namespace Seq2SeqSharp.Tools
private int m_batchNumInTotal = 0;
private int m_startBatchId = 0;

public List<Dictionary<string, int>> CountTokenFreqs()
public List<Dictionary<string, long>> CountTokenFreqs()
{
List<Dictionary<string, int>> td = new List<Dictionary<string, int>>();
List<Dictionary<string, long>> td = new List<Dictionary<string, long>>();

for (int i = 0; i < m_tgtFileList.Count; i++)
{
Expand Down Expand Up @@ -70,7 +70,7 @@ public List<Dictionary<string, int>> CountTokenFreqs()
{
for (int j = 0; j < tgtGroups.Length; j++)
{
td.Add(new Dictionary<string, int>());
td.Add(new Dictionary<string, long>());
}
}

Expand Down
10 changes: 5 additions & 5 deletions Seq2SeqSharp/Corpus/ParallelCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,10 @@ public interface ICorpus<out T> : IEnumerable<T>
private string m_sortedIndexedDataSetFilePath = "";
private int m_batchNumInTotal = 0;

public (List<Dictionary<string, int>>, List<Dictionary<string, int>>) CountTokenFreqs()
public (List<Dictionary<string, long>>, List<Dictionary<string, long>>) CountTokenFreqs()
{
List<Dictionary<string, int>> sd = new List<Dictionary<string, int>>();
List<Dictionary<string, int>> td = new List<Dictionary<string, int>>();
List<Dictionary<string, long>> sd = new List<Dictionary<string, long>>();
List<Dictionary<string, long>> td = new List<Dictionary<string, long>>();

for (int i = 0; i < m_srcFileList.Count; i++)
{
Expand Down Expand Up @@ -90,8 +90,8 @@ public interface ICorpus<out T> : IEnumerable<T>
{
for (int j = 0; j < srcGroups.Length; j++)
{
sd.Add(new Dictionary<string, int>());
td.Add(new Dictionary<string, int>());
sd.Add(new Dictionary<string, long>());
td.Add(new Dictionary<string, long>());
}
}

Expand Down
6 changes: 3 additions & 3 deletions Seq2SeqSharp/Corpus/VisionTextCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ namespace Seq2SeqSharp.Corpus
private string m_sortedIndexedDataSetFilePath = "";
private int m_batchNumInTotal = 0;

public List<Dictionary<string, int>> CountTokenFreqs()
public List<Dictionary<string, long>> CountTokenFreqs()
{
List<Dictionary<string, int>> td = new List<Dictionary<string, int>>();
List<Dictionary<string, long>> td = new List<Dictionary<string, long>>();

for (int i = 0; i < m_tgtFileList.Count; i++)
{
Expand All @@ -77,7 +77,7 @@ public List<Dictionary<string, int>> CountTokenFreqs()
{
for (int j = 0; j < tgtGroups.Length; j++)
{
td.Add(new Dictionary<string, int>());
td.Add(new Dictionary<string, long>());
}
}

Expand Down

0 comments on commit 87b3ad6

Please sign in to comment.