From 1c1dd6795a57649d04d80aab587f22fc6f4e6064 Mon Sep 17 00:00:00 2001
From: Barb Cutler
Date: Tue, 31 Jul 2018 02:13:24 -0400
Subject: [PATCH] Sort processing order & parse json config file (#13)

Sort the processing order of users/versions (helps debugging)
Moved from command line arguments for each script to parsing the config json
---
 bin/concatenate_all.py                      | 31 ++++++-----
 bin/hash_all.py                             | 58 ++++++++++-----------
 bin/tokenize_all.py                         | 36 +++++++------
 compare_hashes/compare_hashes.cpp           | 36 +++++++++----
 tokenizer/plaintext/plaintext_tokenizer.cpp | 19 +++++--
 5 files changed, 108 insertions(+), 72 deletions(-)

diff --git a/bin/concatenate_all.py b/bin/concatenate_all.py
index 98124c0..b367223 100644
--- a/bin/concatenate_all.py
+++ b/bin/concatenate_all.py
@@ -18,9 +18,7 @@
 
 def parse_args():
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("semester")
-    parser.add_argument("course")
-    parser.add_argument("gradeable")
+    parser.add_argument("config_path")
     return parser.parse_args()
 
 
@@ -30,29 +28,35 @@ def main():
     sys.stdout.write("CONCATENATE ALL...")
     sys.stdout.flush()
 
+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        semester = lichen_config_data["semester"]
+        course = lichen_config_data["course"]
+        gradeable = lichen_config_data["gradeable"]
+
     # ===========================================================================
     # error checking
-    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",args.semester,args.course)
+    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",semester,course)
     if not os.path.isdir(course_dir):
         print("ERROR! ",course_dir," is not a valid course directory")
         exit(1)
-    submission_dir=os.path.join(course_dir,"submissions",args.gradeable)
+    submission_dir=os.path.join(course_dir,"submissions",gradeable)
     if not os.path.isdir(submission_dir):
         print("ERROR! ",submission_dir," is not a valid gradeable submissions directory")
         exit(1)
 
     # ===========================================================================
     # create the directory
-    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",args.gradeable)
+    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",gradeable)
     if not os.path.isdir(concatenated_dir):
         os.makedirs(concatenated_dir)
 
     # ===========================================================================
     # walk the subdirectories
-    for user in os.listdir(submission_dir):
+    for user in sorted(os.listdir(submission_dir)):
         if not os.path.isdir(os.path.join(submission_dir,user)):
             continue
-        for version in os.listdir(os.path.join(submission_dir,user)):
+        for version in sorted(os.listdir(os.path.join(submission_dir,user))):
             if not os.path.isdir(os.path.join(submission_dir,user,version)):
                 continue
 
@@ -64,9 +68,9 @@ def main():
             my_concatenated_file=os.path.join(my_concatenated_dir,"submission.concatenated")
             with open(my_concatenated_file,'w') as my_cf:
                 # print a brief header of information
-                my_cf.write("SEMESTER: "+args.semester+"\n")
-                my_cf.write("COURSE: "+args.course+"\n")
-                my_cf.write("GRADEABLE: "+args.gradeable+"\n")
+                my_cf.write("SEMESTER: "+semester+"\n")
+                my_cf.write("COURSE: "+course+"\n")
+                my_cf.write("GRADEABLE: "+gradeable+"\n")
                 my_cf.write("USER: "+user+"\n")
                 my_cf.write("VERSION: "+version+"\n")
                 # loop over all files in all subdirectories
@@ -82,9 +86,10 @@ def main():
                         # print a separator & filename
                         my_cf.write("----------------------------------------------------\n")
                         my_cf.write("FILE: "+relative_path+"\n\n")
-                        with open(absolute_path) as tmp:
+                        with open(absolute_path, encoding='ISO-8859-1') as tmp:
                             # append the contents of the file
-                            my_cf.write(tmp.read()+"\n")
+                            my_cf.write(tmp.read())
+                            my_cf.write("\n")
 
     print ("done")
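All of the scripts in this patch now read the same JSON config file instead of positional arguments. The patch never shows that file itself, so the snippet below is only a sketch inferred from the keys each script reads; the path and example values are hypothetical. Note that sequence_length and threshold are written as strings, because hash_all.py casts with int(...) and compare_hashes.cpp reads them via .value("sequence_length","1") / .value("threshold","5").

    import json

    # Hypothetical path: the scripts read whatever file is passed as config_path.
    config_path = "lichen_config.json"

    # Keys inferred from the reads in this patch; the values are examples only.
    lichen_config = {
        "semester": "f18",
        "course": "csci1100",
        "gradeable": "hw01",
        "language": "plaintext",    # the scripts dispatch on plaintext / python / cpp
        "sequence_length": "10",    # string: hash_all.py casts with int(...)
        "threshold": "20",          # string: matches the old hard-coded cutoff of 20
    }

    with open(config_path, "w") as f:
        json.dump(lichen_config, f, indent=4)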
",submission_dir," is not a valid gradeable submissions directory") exit(1) # =========================================================================== # create the directory - concatenated_dir=os.path.join(course_dir,"lichen","concatenated",args.gradeable) + concatenated_dir=os.path.join(course_dir,"lichen","concatenated",gradeable) if not os.path.isdir(concatenated_dir): os.makedirs(concatenated_dir) # =========================================================================== # walk the subdirectories - for user in os.listdir(submission_dir): + for user in sorted(os.listdir(submission_dir)): if not os.path.isdir(os.path.join(submission_dir,user)): continue - for version in os.listdir(os.path.join(submission_dir,user)): + for version in sorted(os.listdir(os.path.join(submission_dir,user))): if not os.path.isdir(os.path.join(submission_dir,user,version)): continue @@ -64,9 +68,9 @@ def main(): my_concatenated_file=os.path.join(my_concatenated_dir,"submission.concatenated") with open(my_concatenated_file,'w') as my_cf: # print a brief header of information - my_cf.write("SEMESTER: "+args.semester+"\n") - my_cf.write("COURSE: "+args.course+"\n") - my_cf.write("GRADEABLE: "+args.gradeable+"\n") + my_cf.write("SEMESTER: "+semester+"\n") + my_cf.write("COURSE: "+course+"\n") + my_cf.write("GRADEABLE: "+gradeable+"\n") my_cf.write("USER: "+user+"\n") my_cf.write("VERSION: "+version+"\n") # loop over all files in all subdirectories @@ -82,9 +86,10 @@ def main(): # print a separator & filename my_cf.write("----------------------------------------------------\n") my_cf.write("FILE: "+relative_path+"\n\n") - with open(absolute_path) as tmp: + with open(absolute_path, encoding='ISO-8859-1') as tmp: # append the contents of the file - my_cf.write(tmp.read()+"\n") + my_cf.write(tmp.read()) + my_cf.write("\n") print ("done") diff --git a/bin/hash_all.py b/bin/hash_all.py index fe32287..50e8686 100644 --- a/bin/hash_all.py +++ b/bin/hash_all.py @@ -23,42 +23,37 @@ def parse_args(): parser = argparse.ArgumentParser(description="") - parser.add_argument("semester") - parser.add_argument("course") - parser.add_argument("gradeable") - parser.add_argument("--window",type=int,default=10) - parser.add_argument("--hash_size",type=int,default=100000) - language = parser.add_mutually_exclusive_group(required=True) - language.add_argument ("--plaintext", action='store_true') - language.add_argument ("--python", action='store_true') - language.add_argument ("--cpp", action='store_true') - + parser.add_argument("config_path") args = parser.parse_args() - - if (args.window < 1): - print ("ERROR! window must be >= 1") - exit(1) - return args def hasher(args,my_tokenized_file,my_hashes_file): - with open(my_tokenized_file,'r') as my_tf: + with open(args.config_path) as lichen_config: + lichen_config_data = json.load(lichen_config) + language = lichen_config_data["language"] + sequence_length = int(lichen_config_data["sequence_length"]) + + if (sequence_length < 1): + print ("ERROR! 
diff --git a/bin/tokenize_all.py b/bin/tokenize_all.py
index 0173716..bf0abde 100644
--- a/bin/tokenize_all.py
+++ b/bin/tokenize_all.py
@@ -19,32 +19,29 @@
 
 def parse_args():
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("semester")
-    parser.add_argument("course")
-    parser.add_argument("gradeable")
-    language = parser.add_mutually_exclusive_group(required=True)
-    language.add_argument ("--plaintext", action='store_true')
-    language.add_argument ("--python", action='store_true')
-    language.add_argument ("--cpp", action='store_true')
+    parser.add_argument("config_path")
     return parser.parse_args()
 
-
 def tokenize(args,my_concatenated_file,my_tokenized_file):
-    if args.plaintext:
+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        language = lichen_config_data["language"]
+
+    if language == "plaintext":
         tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","plaintext_tokenizer.out")
         with open(my_concatenated_file,'r') as infile:
             with open (my_tokenized_file,'w') as outfile:
                 subprocess.call([tokenizer,"--ignore_newlines"],stdin=infile,stdout=outfile)
-    elif args.python:
+    elif language == "python":
         tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","python_tokenizer.py")
         with open(my_concatenated_file,'r') as infile:
             with open (my_tokenized_file,'w') as outfile:
                 command="python3 "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
                 os.system(command)
-    elif args.cpp:
+    elif language == "cpp":
         tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","c_tokenizer.py")
         with open(my_concatenated_file,'r') as infile:
             with open (my_tokenized_file,'w') as outfile:
@@ -62,23 +59,29 @@ def main():
     sys.stdout.write("TOKENIZE ALL...")
     sys.stdout.flush()
 
+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        semester = lichen_config_data["semester"]
+        course = lichen_config_data["course"]
+        gradeable = lichen_config_data["gradeable"]
+
     # ===========================================================================
     # error checking
-    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",args.semester,args.course)
+    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",semester,course)
     if not os.path.isdir(course_dir):
         print("ERROR! ",course_dir," is not a valid course directory")
         exit(1)
-    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",args.gradeable)
+    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",gradeable)
     if not os.path.isdir(concatenated_dir):
         print("ERROR! ",concatenated_dir," is not a valid gradeable concatenated directory")
         exit(1)
 
-    tokenized_dir=os.path.join(course_dir,"lichen","tokenized",args.gradeable)
+    tokenized_dir=os.path.join(course_dir,"lichen","tokenized",gradeable)
 
     # ===========================================================================
     # walk the subdirectories
-    for user in os.listdir(concatenated_dir):
-        for version in os.listdir(os.path.join(concatenated_dir,user)):
+    for user in sorted(os.listdir(concatenated_dir)):
+        for version in sorted(os.listdir(os.path.join(concatenated_dir,user))):
             my_concatenated_file=os.path.join(concatenated_dir,user,version,"submission.concatenated")
 
@@ -86,7 +89,6 @@ def main():
             my_tokenized_dir=os.path.join(tokenized_dir,user,version)
             if not os.path.isdir(my_tokenized_dir):
                 os.makedirs(my_tokenized_dir)
-
             my_tokenized_file=os.path.join(my_tokenized_dir,"tokens.json")
             tokenize(args,my_concatenated_file,my_tokenized_file)
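One quirk worth noting in tokenize() above: the python and cpp branches open infile/outfile but never use them; the work happens in the os.system call with shell redirection. Below is a sketch of the same dispatch done entirely with subprocess, avoiding the shell; the paths are hypothetical (the script assembles them from SUBMITTY_INSTALL_DIR), and this is an alternative sketch, not the patch's code.

    import subprocess

    # Hypothetical paths; the script builds these from SUBMITTY_INSTALL_DIR.
    plaintext_tok = "/usr/local/submitty/Lichen/bin/plaintext_tokenizer.out"
    python_tok = "/usr/local/submitty/Lichen/bin/python_tokenizer.py"

    def run_tokenizer(language, concatenated, tokens_json):
        with open(concatenated) as infile, open(tokens_json, "w") as outfile:
            if language == "plaintext":
                # compiled tokenizer: reads stdin, writes the token list to stdout
                subprocess.call([plaintext_tok, "--ignore_newlines"],
                                stdin=infile, stdout=outfile)
            elif language == "python":
                # equivalent of the os.system("python3 ... > ...") line above,
                # without shell redirection or unquoted paths
                subprocess.call(["python3", python_tok, concatenated],
                                stdout=outfile)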
diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp
index a774fb3..80f3a6c 100644
--- a/compare_hashes/compare_hashes.cpp
+++ b/compare_hashes/compare_hashes.cpp
@@ -125,13 +125,23 @@ int main(int argc, char* argv[]) {
 
   // ---------------------------------------------------------------------------
   // deal with command line arguments
-  assert (argc == 6);
-  std::string semester = argv[1];
-  std::string course = argv[2];
-  std::string gradeable = argv[3];
-  assert (argv[4] == std::string("--window"));
-  int window = std::stoi(std::string(argv[5]));
-  assert (window >= 1);
+  assert (argc == 2);
+  std::string config_file = argv[1];
+
+  std::ifstream istr(config_file.c_str());
+  assert (istr.good());
+  nlohmann::json config_file_json = nlohmann::json::parse(istr);
+
+  std::string semester = config_file_json.value("semester","ERROR");
+  std::string course = config_file_json.value("course","ERROR");
+  std::string gradeable = config_file_json.value("gradeable","ERROR");
+  std::string sequence_length_str = config_file_json.value("sequence_length","1");
+  int sequence_length = std::stoi(sequence_length_str);
+  std::string threshold_str = config_file_json.value("threshold","5");
+  int threshold = std::stoi(threshold_str);
+
+  assert (sequence_length >= 1);
+  assert (threshold >= 2);
 
   // error checking, confirm there are hashes to work with
   std::string tmp = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/hashes/"+gradeable;
@@ -180,6 +190,8 @@ int main(int argc, char* argv[]) {
     }
   }
 
+  std::cout << "finished loading" << std::endl;
+
   // ---------------------------------------------------------------------------
   // label the parts of the file that are common to many
 
@@ -194,6 +206,7 @@ int main(int argc, char* argv[]) {
   // user,version -> ( position -> ( other user,version -> std::vector ) )
   std::map<std::string,std::map<int,std::map<std::string,std::vector<int> > > > suspicious;
 
+  int my_counter = 0;
 
   // ---------------------------------------------------------------------------
   // walk over the structure containing all of the hashes identifying
@@ -201,14 +214,18 @@ int main(int argc, char* argv[]) {
   for (hashed_sequences::iterator itr = hash_counts.begin(); itr != hash_counts.end(); itr++) {
     int count = itr->second.size();
 
-    if (count >= 20) {
+    my_counter++;
+
+    std::cout << "hash walk " << hash_counts.size() << " " << my_counter << std::endl;
+
+    if (count > threshold) {
       // common to many/all
       for (std::map<std::string,std::vector<HashLocation> >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) {
         for (int i = 0; i < itr2->second.size(); i++) {
           common[itr2->second[i].submission].insert(itr2->second[i].position);
         }
       }
-    } else if (count > 1 && count < 20) {
+    } else if (count > 1 && count <= threshold) {
       // suspicious matches
       for (std::map<std::string,std::vector<HashLocation> >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) {
         std::string username = itr2->first;
@@ -234,6 +251,7 @@ int main(int argc, char* argv[]) {
     }
   }
 
+  std::cout << "finished walking" << std::endl;
 
   // ---------------------------------------------------------------------------
   // prepare a sorted list of all users sorted by match percent
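The hard-coded cutoff of 20 becomes the configurable threshold, and the boundary case moves: a hash shared by exactly threshold submissions used to land in the common branch (count >= 20) but now counts as suspicious (count <= threshold). A small sketch of the new classification:

    def classify(count, threshold):
        # mirrors the branch above: shared too widely -> boilerplate/common;
        # shared by a few submissions -> suspicious; count == 1 -> unique
        if count > threshold:
            return "common"
        elif count > 1:
            return "suspicious"
        else:
            return "unique"

    # with the default threshold of "5" that compare_hashes.cpp falls back to:
    # classify(1, 5) -> "unique"; classify(3, 5) -> "suspicious";
    # classify(5, 5) -> "suspicious"; classify(6, 5) -> "common"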
config_file_json.value("gradeable","ERROR"); + std::string sequence_length_str = config_file_json.value("sequence_length","1"); + int sequence_length = std::stoi(sequence_length_str); + std::string threshold_str = config_file_json.value("threshold","5"); + int threshold = std::stoi(threshold_str); + + assert (sequence_length >= 1); + assert (threshold >= 2); // error checking, confirm there are hashes to work with std::string tmp = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/hashes/"+gradeable; @@ -180,6 +190,8 @@ int main(int argc, char* argv[]) { } } + std::cout << "finished loading" << std::endl; + // --------------------------------------------------------------------------- // label the parts of the file that are common to many @@ -194,6 +206,7 @@ int main(int argc, char* argv[]) { // user,version -> ( position -> ( other user,version -> std::vector ) ) std::map > > > suspicious; + int my_counter = 0; // --------------------------------------------------------------------------- // walk over the structure containing all of the hashes identifying @@ -201,14 +214,18 @@ int main(int argc, char* argv[]) { for (hashed_sequences::iterator itr = hash_counts.begin(); itr != hash_counts.end(); itr++) { int count = itr->second.size(); - if (count >= 20) { + my_counter++; + + std::cout << "hash walk " << hash_counts.size() << " " << my_counter << std::endl; + + if (count > threshold) { // common to many/all for (std::map >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) { for (int i = 0; i < itr2->second.size(); i++) { common[itr2->second[i].submission].insert(itr2->second[i].position); } } - } else if (count > 1 && count < 20) { + } else if (count > 1 && count <= threshold) { // suspicious matches for (std::map >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) { std::string username = itr2->first; @@ -234,6 +251,7 @@ int main(int argc, char* argv[]) { } } + std::cout << "finished walking" << std::endl; // --------------------------------------------------------------------------- // prepare a sorted list of all users sorted by match percent diff --git a/tokenizer/plaintext/plaintext_tokenizer.cpp b/tokenizer/plaintext/plaintext_tokenizer.cpp index 633c56d..089b4ba 100644 --- a/tokenizer/plaintext/plaintext_tokenizer.cpp +++ b/tokenizer/plaintext/plaintext_tokenizer.cpp @@ -13,6 +13,19 @@ void usage(const std::string &program) { } +void deal_with_number(std::map& tmp, const std::string& token) { + try { + // normal case, convert to integer + tmp["type"]="number"; + tmp["value"]=std::stoi(token); + } catch (...) { + // if conversion fails (integer too big!) + tmp["type"]="string"; + tmp["value"]=token; + } +} + + int main(int argc, char* argv[]) { // ------------------------------ @@ -72,8 +85,7 @@ int main(int argc, char* argv[]) { tmp["char"]=start_col; if (last_was_digit) { assert (!last_was_alpha); - tmp["type"]="number"; - tmp["value"]=std::stoi(token); + deal_with_number(tmp,token); } else { assert (last_was_alpha); tmp["type"]="string"; @@ -171,8 +183,7 @@ int main(int argc, char* argv[]) { tmp["char"]=start_col; if (last_was_digit) { assert (!last_was_alpha); - tmp["type"]="number"; - tmp["value"]=std::stoi(token); + deal_with_number(tmp,token); } else { assert (last_was_alpha); tmp["type"]="string";