From 1c1dd6795a57649d04d80aab587f22fc6f4e6064 Mon Sep 17 00:00:00 2001
From: Barb Cutler
Date: Tue, 31 Jul 2018 02:13:24 -0400
Subject: [PATCH] Sort processing order & parse json config file (#13)

Sort the processing order of users/versions (helps debugging)
Moved from command line arguments for each script to parsing the config json
---
 bin/concatenate_all.py                      | 31 ++++++-----
 bin/hash_all.py                             | 58 ++++++++++-----------
 bin/tokenize_all.py                         | 36 +++++++------
 compare_hashes/compare_hashes.cpp           | 36 +++++++++----
 tokenizer/plaintext/plaintext_tokenizer.cpp | 19 +++++--
 5 files changed, 108 insertions(+), 72 deletions(-)

diff --git a/bin/concatenate_all.py b/bin/concatenate_all.py
index 98124c0..b367223 100644
--- a/bin/concatenate_all.py
+++ b/bin/concatenate_all.py
@@ -18,9 +18,7 @@
 
 def parse_args():
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("semester")
-    parser.add_argument("course")
-    parser.add_argument("gradeable")
+    parser.add_argument("config_path")
     return parser.parse_args()
 
 
@@ -30,29 +28,35 @@ def main():
     sys.stdout.write("CONCATENATE ALL...")
     sys.stdout.flush()
 
+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        semester = lichen_config_data["semester"]
+        course = lichen_config_data["course"]
+        gradeable = lichen_config_data["gradeable"]
+
     # ===========================================================================
     # error checking
-    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",args.semester,args.course)
+    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",semester,course)
     if not os.path.isdir(course_dir):
         print("ERROR! ",course_dir," is not a valid course directory")
         exit(1)
-    submission_dir=os.path.join(course_dir,"submissions",args.gradeable)
+    submission_dir=os.path.join(course_dir,"submissions",gradeable)
     if not os.path.isdir(submission_dir):
         print("ERROR! ",submission_dir," is not a valid gradeable submissions directory")
         exit(1)
 
     # ===========================================================================
     # create the directory
-    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",args.gradeable)
+    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",gradeable)
     if not os.path.isdir(concatenated_dir):
         os.makedirs(concatenated_dir)
 
     # ===========================================================================
     # walk the subdirectories
-    for user in os.listdir(submission_dir):
+    for user in sorted(os.listdir(submission_dir)):
         if not os.path.isdir(os.path.join(submission_dir,user)):
             continue
-        for version in os.listdir(os.path.join(submission_dir,user)):
+        for version in sorted(os.listdir(os.path.join(submission_dir,user))):
             if not os.path.isdir(os.path.join(submission_dir,user,version)):
                 continue
 
@@ -64,9 +68,9 @@ def main():
             my_concatenated_file=os.path.join(my_concatenated_dir,"submission.concatenated")
             with open(my_concatenated_file,'w') as my_cf:
                 # print a brief header of information
-                my_cf.write("SEMESTER: "+args.semester+"\n")
-                my_cf.write("COURSE: "+args.course+"\n")
-                my_cf.write("GRADEABLE: "+args.gradeable+"\n")
+                my_cf.write("SEMESTER: "+semester+"\n")
+                my_cf.write("COURSE: "+course+"\n")
+                my_cf.write("GRADEABLE: "+gradeable+"\n")
                 my_cf.write("USER: "+user+"\n")
                 my_cf.write("VERSION: "+version+"\n")
                 # loop over all files in all subdirectories
@@ -82,9 +86,10 @@ def main():
                         # print a separator & filename
                         my_cf.write("----------------------------------------------------\n")
                         my_cf.write("FILE: "+relative_path+"\n\n")
-                        with open(absolute_path) as tmp:
+                        with open(absolute_path, encoding='ISO-8859-1') as tmp:
                             # append the contents of the file
-                            my_cf.write(tmp.read()+"\n")
+                            my_cf.write(tmp.read())
+                            my_cf.write("\n")
 
     print ("done")
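All of the scripts in this patch now read the same JSON config file instead of positional arguments. The patch never shows that file itself, so the snippet below is only a sketch inferred from the keys each script reads; the path and example values are hypothetical. Note that sequence_length and threshold are written as strings, because hash_all.py casts with int(...) and compare_hashes.cpp reads them via .value("sequence_length","1") / .value("threshold","5").

    import json

    # Hypothetical path: the scripts read whatever file is passed as config_path.
    config_path = "lichen_config.json"

    # Keys inferred from the reads in this patch; the values are examples only.
    lichen_config = {
        "semester": "f18",
        "course": "csci1100",
        "gradeable": "hw01",
        "language": "plaintext",    # the scripts dispatch on plaintext / python / cpp
        "sequence_length": "10",    # string: hash_all.py casts with int(...)
        "threshold": "20",          # string: matches the old hard-coded cutoff of 20
    }

    with open(config_path, "w") as f:
        json.dump(lichen_config, f, indent=4)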
",submission_dir," is not a valid gradeable submissions directory") exit(1) # =========================================================================== # create the directory - concatenated_dir=os.path.join(course_dir,"lichen","concatenated",args.gradeable) + concatenated_dir=os.path.join(course_dir,"lichen","concatenated",gradeable) if not os.path.isdir(concatenated_dir): os.makedirs(concatenated_dir) # =========================================================================== # walk the subdirectories - for user in os.listdir(submission_dir): + for user in sorted(os.listdir(submission_dir)): if not os.path.isdir(os.path.join(submission_dir,user)): continue - for version in os.listdir(os.path.join(submission_dir,user)): + for version in sorted(os.listdir(os.path.join(submission_dir,user))): if not os.path.isdir(os.path.join(submission_dir,user,version)): continue @@ -64,9 +68,9 @@ def main(): my_concatenated_file=os.path.join(my_concatenated_dir,"submission.concatenated") with open(my_concatenated_file,'w') as my_cf: # print a brief header of information - my_cf.write("SEMESTER: "+args.semester+"\n") - my_cf.write("COURSE: "+args.course+"\n") - my_cf.write("GRADEABLE: "+args.gradeable+"\n") + my_cf.write("SEMESTER: "+semester+"\n") + my_cf.write("COURSE: "+course+"\n") + my_cf.write("GRADEABLE: "+gradeable+"\n") my_cf.write("USER: "+user+"\n") my_cf.write("VERSION: "+version+"\n") # loop over all files in all subdirectories @@ -82,9 +86,10 @@ def main(): # print a separator & filename my_cf.write("----------------------------------------------------\n") my_cf.write("FILE: "+relative_path+"\n\n") - with open(absolute_path) as tmp: + with open(absolute_path, encoding='ISO-8859-1') as tmp: # append the contents of the file - my_cf.write(tmp.read()+"\n") + my_cf.write(tmp.read()) + my_cf.write("\n") print ("done") diff --git a/bin/hash_all.py b/bin/hash_all.py index fe32287..50e8686 100644 --- a/bin/hash_all.py +++ b/bin/hash_all.py @@ -23,42 +23,37 @@ def parse_args(): parser = argparse.ArgumentParser(description="") - parser.add_argument("semester") - parser.add_argument("course") - parser.add_argument("gradeable") - parser.add_argument("--window",type=int,default=10) - parser.add_argument("--hash_size",type=int,default=100000) - language = parser.add_mutually_exclusive_group(required=True) - language.add_argument ("--plaintext", action='store_true') - language.add_argument ("--python", action='store_true') - language.add_argument ("--cpp", action='store_true') - + parser.add_argument("config_path") args = parser.parse_args() - - if (args.window < 1): - print ("ERROR! window must be >= 1") - exit(1) - return args def hasher(args,my_tokenized_file,my_hashes_file): - with open(my_tokenized_file,'r') as my_tf: + with open(args.config_path) as lichen_config: + lichen_config_data = json.load(lichen_config) + language = lichen_config_data["language"] + sequence_length = int(lichen_config_data["sequence_length"]) + + if (sequence_length < 1): + print ("ERROR! 
diff --git a/bin/tokenize_all.py b/bin/tokenize_all.py
index 0173716..bf0abde 100644
--- a/bin/tokenize_all.py
+++ b/bin/tokenize_all.py
@@ -19,32 +19,29 @@
 
 def parse_args():
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("semester")
-    parser.add_argument("course")
-    parser.add_argument("gradeable")
-    language = parser.add_mutually_exclusive_group(required=True)
-    language.add_argument ("--plaintext", action='store_true')
-    language.add_argument ("--python", action='store_true')
-    language.add_argument ("--cpp", action='store_true')
+    parser.add_argument("config_path")
     return parser.parse_args()
 
-
 def tokenize(args,my_concatenated_file,my_tokenized_file):
-    if args.plaintext:
+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        language = lichen_config_data["language"]
+
+    if language == "plaintext":
         tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","plaintext_tokenizer.out")
         with open(my_concatenated_file,'r') as infile:
             with open (my_tokenized_file,'w') as outfile:
                 subprocess.call([tokenizer,"--ignore_newlines"],stdin=infile,stdout=outfile)
-    elif args.python:
+    elif language == "python":
         tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","python_tokenizer.py")
         with open(my_concatenated_file,'r') as infile:
             with open (my_tokenized_file,'w') as outfile:
                 command="python3 "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
                 os.system(command)
-    elif args.cpp:
+    elif language == "cpp":
         tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","c_tokenizer.py")
         with open(my_concatenated_file,'r') as infile:
             with open (my_tokenized_file,'w') as outfile:
@@ -62,23 +59,29 @@ def main():
     sys.stdout.write("TOKENIZE ALL...")
     sys.stdout.flush()
 
+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        semester = lichen_config_data["semester"]
+        course = lichen_config_data["course"]
+        gradeable = lichen_config_data["gradeable"]
+
     # ===========================================================================
     # error checking
-    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",args.semester,args.course)
+    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",semester,course)
     if not os.path.isdir(course_dir):
         print("ERROR! ",course_dir," is not a valid course directory")
         exit(1)
-    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",args.gradeable)
+    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",gradeable)
     if not os.path.isdir(concatenated_dir):
         print("ERROR! ",concatenated_dir," is not a valid gradeable concatenated directory")
         exit(1)
 
-    tokenized_dir=os.path.join(course_dir,"lichen","tokenized",args.gradeable)
+    tokenized_dir=os.path.join(course_dir,"lichen","tokenized",gradeable)
 
     # ===========================================================================
     # walk the subdirectories
-    for user in os.listdir(concatenated_dir):
-        for version in os.listdir(os.path.join(concatenated_dir,user)):
+    for user in sorted(os.listdir(concatenated_dir)):
+        for version in sorted(os.listdir(os.path.join(concatenated_dir,user))):
             my_concatenated_file=os.path.join(concatenated_dir,user,version,"submission.concatenated")
 
@@ -86,7 +89,6 @@ def main():
             my_tokenized_dir=os.path.join(tokenized_dir,user,version)
             if not os.path.isdir(my_tokenized_dir):
                 os.makedirs(my_tokenized_dir)
-
             my_tokenized_file=os.path.join(my_tokenized_dir,"tokens.json")
             tokenize(args,my_concatenated_file,my_tokenized_file)
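One quirk worth noting in tokenize() above: the python and cpp branches open infile/outfile but never use them; the work happens in the os.system call with shell redirection. Below is a sketch of the same dispatch done entirely with subprocess, avoiding the shell; the paths are hypothetical (the script assembles them from SUBMITTY_INSTALL_DIR), and this is an alternative sketch, not the patch's code.

    import subprocess

    # Hypothetical paths; the script builds these from SUBMITTY_INSTALL_DIR.
    plaintext_tok = "/usr/local/submitty/Lichen/bin/plaintext_tokenizer.out"
    python_tok = "/usr/local/submitty/Lichen/bin/python_tokenizer.py"

    def run_tokenizer(language, concatenated, tokens_json):
        with open(concatenated) as infile, open(tokens_json, "w") as outfile:
            if language == "plaintext":
                # compiled tokenizer: reads stdin, writes the token list to stdout
                subprocess.call([plaintext_tok, "--ignore_newlines"],
                                stdin=infile, stdout=outfile)
            elif language == "python":
                # equivalent of the os.system("python3 ... > ...") line above,
                # without shell redirection or unquoted paths
                subprocess.call(["python3", python_tok, concatenated],
                                stdout=outfile)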
diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp
index a774fb3..80f3a6c 100644
--- a/compare_hashes/compare_hashes.cpp
+++ b/compare_hashes/compare_hashes.cpp
@@ -125,13 +125,23 @@ int main(int argc, char* argv[]) {
 
   // ---------------------------------------------------------------------------
   // deal with command line arguments
-  assert (argc == 6);
-  std::string semester = argv[1];
-  std::string course = argv[2];
-  std::string gradeable = argv[3];
-  assert (argv[4] == std::string("--window"));
-  int window = std::stoi(std::string(argv[5]));
-  assert (window >= 1);
+  assert (argc == 2);
+  std::string config_file = argv[1];
+
+  std::ifstream istr(config_file.c_str());
+  assert (istr.good());
+  nlohmann::json config_file_json = nlohmann::json::parse(istr);
+
+  std::string semester = config_file_json.value("semester","ERROR");
+  std::string course = config_file_json.value("course","ERROR");
+  std::string gradeable = config_file_json.value("gradeable","ERROR");
+  std::string sequence_length_str = config_file_json.value("sequence_length","1");
+  int sequence_length = std::stoi(sequence_length_str);
+  std::string threshold_str = config_file_json.value("threshold","5");
+  int threshold = std::stoi(threshold_str);
+
+  assert (sequence_length >= 1);
+  assert (threshold >= 2);
 
   // error checking, confirm there are hashes to work with
   std::string tmp = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/hashes/"+gradeable;
@@ -180,6 +190,8 @@ int main(int argc, char* argv[]) {
     }
   }
 
+  std::cout << "finished loading" << std::endl;
+
   // ---------------------------------------------------------------------------
   // label the parts of the file that are common to many
 
@@ -194,6 +206,7 @@ int main(int argc, char* argv[]) {
   // user,version -> ( position -> ( other user,version -> std::vector ) )
   std::map<std::string,std::map<int,std::map<std::string,std::vector<int> > > > suspicious;
 
+  int my_counter = 0;
 
   // ---------------------------------------------------------------------------
   // walk over the structure containing all of the hashes identifying
@@ -201,14 +214,18 @@ int main(int argc, char* argv[]) {
   for (hashed_sequences::iterator itr = hash_counts.begin(); itr != hash_counts.end(); itr++) {
     int count = itr->second.size();
 
-    if (count >= 20) {
+    my_counter++;
+
+    std::cout << "hash walk " << hash_counts.size() << " " << my_counter << std::endl;
+
+    if (count > threshold) {
       // common to many/all
       for (std::map<std::string,std::vector<HashLocation> >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) {
         for (int i = 0; i < itr2->second.size(); i++) {
           common[itr2->second[i].submission].insert(itr2->second[i].position);
         }
       }
-    } else if (count > 1 && count < 20) {
+    } else if (count > 1 && count <= threshold) {
       // suspicious matches
       for (std::map<std::string,std::vector<HashLocation> >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) {
         std::string username = itr2->first;
@@ -234,6 +251,7 @@ int main(int argc, char* argv[]) {
     }
   }
 
+  std::cout << "finished walking" << std::endl;
 
   // ---------------------------------------------------------------------------
   // prepare a sorted list of all users sorted by match percent
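The hard-coded cutoff of 20 becomes the configurable threshold, and the boundary case moves: a hash shared by exactly threshold submissions used to land in the common branch (count >= 20) but now counts as suspicious (count <= threshold). A small sketch of the new classification:

    def classify(count, threshold):
        # mirrors the branch above: shared too widely -> boilerplate/common;
        # shared by a few submissions -> suspicious; count == 1 -> unique
        if count > threshold:
            return "common"
        elif count > 1:
            return "suspicious"
        else:
            return "unique"

    # with the default threshold of "5" that compare_hashes.cpp falls back to:
    # classify(1, 5) -> "unique"; classify(3, 5) -> "suspicious";
    # classify(5, 5) -> "suspicious"; classify(6, 5) -> "common"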
config_file_json.value("gradeable","ERROR"); + std::string sequence_length_str = config_file_json.value("sequence_length","1"); + int sequence_length = std::stoi(sequence_length_str); + std::string threshold_str = config_file_json.value("threshold","5"); + int threshold = std::stoi(threshold_str); + + assert (sequence_length >= 1); + assert (threshold >= 2); // error checking, confirm there are hashes to work with std::string tmp = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/hashes/"+gradeable; @@ -180,6 +190,8 @@ int main(int argc, char* argv[]) { } } + std::cout << "finished loading" << std::endl; + // --------------------------------------------------------------------------- // label the parts of the file that are common to many @@ -194,6 +206,7 @@ int main(int argc, char* argv[]) { // user,version -> ( position -> ( other user,version -> std::vector ) ) std::map > > > suspicious; + int my_counter = 0; // --------------------------------------------------------------------------- // walk over the structure containing all of the hashes identifying @@ -201,14 +214,18 @@ int main(int argc, char* argv[]) { for (hashed_sequences::iterator itr = hash_counts.begin(); itr != hash_counts.end(); itr++) { int count = itr->second.size(); - if (count >= 20) { + my_counter++; + + std::cout << "hash walk " << hash_counts.size() << " " << my_counter << std::endl; + + if (count > threshold) { // common to many/all for (std::map >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) { for (int i = 0; i < itr2->second.size(); i++) { common[itr2->second[i].submission].insert(itr2->second[i].position); } } - } else if (count > 1 && count < 20) { + } else if (count > 1 && count <= threshold) { // suspicious matches for (std::map >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) { std::string username = itr2->first; @@ -234,6 +251,7 @@ int main(int argc, char* argv[]) { } } + std::cout << "finished walking" << std::endl; // --------------------------------------------------------------------------- // prepare a sorted list of all users sorted by match percent diff --git a/tokenizer/plaintext/plaintext_tokenizer.cpp b/tokenizer/plaintext/plaintext_tokenizer.cpp index 633c56d..089b4ba 100644 --- a/tokenizer/plaintext/plaintext_tokenizer.cpp +++ b/tokenizer/plaintext/plaintext_tokenizer.cpp @@ -13,6 +13,19 @@ void usage(const std::string &program) { } +void deal_with_number(std::map& tmp, const std::string& token) { + try { + // normal case, convert to integer + tmp["type"]="number"; + tmp["value"]=std::stoi(token); + } catch (...) { + // if conversion fails (integer too big!) + tmp["type"]="string"; + tmp["value"]=token; + } +} + + int main(int argc, char* argv[]) { // ------------------------------ @@ -72,8 +85,7 @@ int main(int argc, char* argv[]) { tmp["char"]=start_col; if (last_was_digit) { assert (!last_was_alpha); - tmp["type"]="number"; - tmp["value"]=std::stoi(token); + deal_with_number(tmp,token); } else { assert (last_was_alpha); tmp["type"]="string"; @@ -171,8 +183,7 @@ int main(int argc, char* argv[]) { tmp["char"]=start_col; if (last_was_digit) { assert (!last_was_alpha); - tmp["type"]="number"; - tmp["value"]=std::stoi(token); + deal_with_number(tmp,token); } else { assert (last_was_alpha); tmp["type"]="string";