Sort processing order & parse json config file (#13)
Sort the processing order of users/versions (deterministic order helps debugging)
Move from per-script command line arguments to parsing a shared JSON config file
bmcutler authored Jul 31, 2018
1 parent 4c25f0c commit 1c1dd67
Showing 5 changed files with 108 additions and 72 deletions.
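For reference, the JSON config file that these scripts now share looks roughly like the following. This is a sketch assembled from the keys read in this diff (semester, course, gradeable, language, sequence_length, threshold); the values shown and the file's location are assumptions. Note that sequence_length and threshold appear as strings because compare_hashes.cpp reads them with config_file_json.value(...) into std::string before std::stoi:

    {
        "semester": "f18",
        "course": "sample_course",
        "gradeable": "hw01",
        "language": "plaintext",
        "sequence_length": "10",
        "threshold": "5"
    }

Each script is now invoked as e.g. concatenate_all.py <config_path> instead of concatenate_all.py <semester> <course> <gradeable> plus per-script flags.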
31 changes: 18 additions & 13 deletions bin/concatenate_all.py
@@ -18,9 +18,7 @@

 def parse_args():
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("semester")
-    parser.add_argument("course")
-    parser.add_argument("gradeable")
+    parser.add_argument("config_path")
     return parser.parse_args()


@@ -30,29 +28,35 @@ def main():
     sys.stdout.write("CONCATENATE ALL...")
     sys.stdout.flush()

+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        semester = lichen_config_data["semester"]
+        course = lichen_config_data["course"]
+        gradeable = lichen_config_data["gradeable"]
+
     # ===========================================================================
     # error checking
-    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",args.semester,args.course)
+    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",semester,course)
     if not os.path.isdir(course_dir):
         print("ERROR! ",course_dir," is not a valid course directory")
         exit(1)
-    submission_dir=os.path.join(course_dir,"submissions",args.gradeable)
+    submission_dir=os.path.join(course_dir,"submissions",gradeable)
     if not os.path.isdir(submission_dir):
         print("ERROR! ",submission_dir," is not a valid gradeable submissions directory")
         exit(1)

     # ===========================================================================
     # create the directory
-    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",args.gradeable)
+    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",gradeable)
     if not os.path.isdir(concatenated_dir):
         os.makedirs(concatenated_dir)

     # ===========================================================================
     # walk the subdirectories
-    for user in os.listdir(submission_dir):
+    for user in sorted(os.listdir(submission_dir)):
         if not os.path.isdir(os.path.join(submission_dir,user)):
             continue
-        for version in os.listdir(os.path.join(submission_dir,user)):
+        for version in sorted(os.listdir(os.path.join(submission_dir,user))):
             if not os.path.isdir(os.path.join(submission_dir,user,version)):
                 continue

@@ -64,9 +68,9 @@
             my_concatenated_file=os.path.join(my_concatenated_dir,"submission.concatenated")
             with open(my_concatenated_file,'w') as my_cf:
                 # print a brief header of information
-                my_cf.write("SEMESTER: "+args.semester+"\n")
-                my_cf.write("COURSE: "+args.course+"\n")
-                my_cf.write("GRADEABLE: "+args.gradeable+"\n")
+                my_cf.write("SEMESTER: "+semester+"\n")
+                my_cf.write("COURSE: "+course+"\n")
+                my_cf.write("GRADEABLE: "+gradeable+"\n")
                 my_cf.write("USER: "+user+"\n")
                 my_cf.write("VERSION: "+version+"\n")
                 # loop over all files in all subdirectories
@@ -82,9 +86,10 @@
                     # print a separator & filename
                     my_cf.write("----------------------------------------------------\n")
                     my_cf.write("FILE: "+relative_path+"\n\n")
-                    with open(absolute_path) as tmp:
+                    with open(absolute_path, encoding='ISO-8859-1') as tmp:
                         # append the contents of the file
-                        my_cf.write(tmp.read()+"\n")
+                        my_cf.write(tmp.read())
+                        my_cf.write("\n")

     print ("done")

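Two details in this file beyond the config change: the sorted() wrappers make the user/version walk order deterministic (os.listdir returns entries in arbitrary order), and opening files with encoding='ISO-8859-1' means a read can never fail on arbitrary student bytes, because latin-1 maps every byte to a code point. A minimal standalone demonstration of that second property (not from the diff):

    # every possible byte decodes under ISO-8859-1, so reading student
    # submissions with this encoding cannot raise UnicodeDecodeError
    data = bytes(range(256))
    text = data.decode('ISO-8859-1')
    assert len(text) == 256 and text.encode('ISO-8859-1') == data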
58 changes: 29 additions & 29 deletions bin/hash_all.py
@@ -23,42 +23,37 @@

 def parse_args():
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("semester")
-    parser.add_argument("course")
-    parser.add_argument("gradeable")
-    parser.add_argument("--window",type=int,default=10)
-    parser.add_argument("--hash_size",type=int,default=100000)
-    language = parser.add_mutually_exclusive_group(required=True)
-    language.add_argument ("--plaintext", action='store_true')
-    language.add_argument ("--python", action='store_true')
-    language.add_argument ("--cpp", action='store_true')
-
+    parser.add_argument("config_path")
     args = parser.parse_args()
-
-    if (args.window < 1):
-        print ("ERROR! window must be >= 1")
-        exit(1)
-
     return args


 def hasher(args,my_tokenized_file,my_hashes_file):
-    with open(my_tokenized_file,'r') as my_tf:
+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        language = lichen_config_data["language"]
+        sequence_length = int(lichen_config_data["sequence_length"])
+
+    if (sequence_length < 1):
+        print ("ERROR! sequence_length must be >= 1")
+        exit(1)
+
+    with open(my_tokenized_file,'r',encoding='ISO-8859-1') as my_tf:
         with open(my_hashes_file,'w') as my_hf:
             tokens = json.load(my_tf)
             num = len(tokens)
-            for i in range(0,num-args.window):
+            for i in range(0,num-sequence_length):
                 foo=""
-                if args.plaintext:
-                    for j in range(0,args.window):
+                if language == "plaintext":
+                    for j in range(0,sequence_length):
                         foo+=str(tokens[i+j].get("value"))

-                elif args.python:
-                    for j in range(0,args.window):
+                elif language == "python":
+                    for j in range(0,sequence_length):
                         foo+=str(tokens[i+j].get("type"))

-                elif args.cpp:
-                    for j in range(0,args.window):
+                elif language == "cpp":
+                    for j in range(0,sequence_length):
                         foo+=str(tokens[i+j].get("type"))

                 else:
@@ -77,26 +72,32 @@ def hasher(args,my_tokenized_file,my_hashes_file):
 def main():
     args = parse_args()

+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        semester = lichen_config_data["semester"]
+        course = lichen_config_data["course"]
+        gradeable = lichen_config_data["gradeable"]
+
     sys.stdout.write("HASH ALL...")
     sys.stdout.flush()

     # ===========================================================================
     # error checking
-    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",args.semester,args.course)
+    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",semester,course)
     if not os.path.isdir(course_dir):
         print("ERROR! ",course_dir," is not a valid course directory")
         exit(1)
-    tokenized_dir=os.path.join(course_dir,"lichen","tokenized",args.gradeable)
+    tokenized_dir=os.path.join(course_dir,"lichen","tokenized",gradeable)
     if not os.path.isdir(tokenized_dir):
         print("ERROR! ",tokenized_dir," is not a valid gradeable tokenized directory")
         exit(1)

-    hashes_dir=os.path.join(course_dir,"lichen","hashes",args.gradeable)
+    hashes_dir=os.path.join(course_dir,"lichen","hashes",gradeable)

     # ===========================================================================
     # walk the subdirectories
-    for user in os.listdir(tokenized_dir):
-        for version in os.listdir(os.path.join(tokenized_dir,user)):
+    for user in sorted(os.listdir(tokenized_dir)):
+        for version in sorted(os.listdir(os.path.join(tokenized_dir,user))):
             my_tokenized_file=os.path.join(tokenized_dir,user,version,"tokens.json")

             # ===========================================================================
@@ -108,7 +109,6 @@ def main():
             my_hashes_file=os.path.join(my_hashes_dir,"hashes.txt")
             hasher(args,my_tokenized_file,my_hashes_file)

-
     print("done")

 if __name__ == "__main__":
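The hasher loop slides a window of sequence_length consecutive tokens across each submission and fingerprints every window, keyed on token values for plaintext and token types for python/cpp. A condensed sketch of that pattern; the actual digest computation falls outside the visible hunk, so hashlib.md5 below is an assumption:

    import hashlib

    def window_fingerprints(tokens, sequence_length, field):
        # field is "value" for plaintext, "type" for python/cpp
        for i in range(0, len(tokens) - sequence_length):
            window = "".join(str(tokens[i + j].get(field)) for j in range(0, sequence_length))
            yield hashlib.md5(window.encode()).hexdigest()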
36 changes: 19 additions & 17 deletions bin/tokenize_all.py
@@ -19,32 +19,29 @@

 def parse_args():
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("semester")
-    parser.add_argument("course")
-    parser.add_argument("gradeable")
-    language = parser.add_mutually_exclusive_group(required=True)
-    language.add_argument ("--plaintext", action='store_true')
-    language.add_argument ("--python", action='store_true')
-    language.add_argument ("--cpp", action='store_true')
+    parser.add_argument("config_path")
     return parser.parse_args()


 def tokenize(args,my_concatenated_file,my_tokenized_file):

-    if args.plaintext:
+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        language = lichen_config_data["language"]
+
+    if language == "plaintext":
         tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","plaintext_tokenizer.out")
         with open(my_concatenated_file,'r') as infile:
             with open (my_tokenized_file,'w') as outfile:
                 subprocess.call([tokenizer,"--ignore_newlines"],stdin=infile,stdout=outfile)

-    elif args.python:
+    elif language == "python":
         tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","python_tokenizer.py")
         with open(my_concatenated_file,'r') as infile:
             with open (my_tokenized_file,'w') as outfile:
                 command="python3 "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
                 os.system(command)

-    elif args.cpp:
+    elif language == "cpp":
         tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","c_tokenizer.py")
         with open(my_concatenated_file,'r') as infile:
             with open (my_tokenized_file,'w') as outfile:
@@ -62,31 +59,36 @@ def main():
     sys.stdout.write("TOKENIZE ALL...")
     sys.stdout.flush()

+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        semester = lichen_config_data["semester"]
+        course = lichen_config_data["course"]
+        gradeable = lichen_config_data["gradeable"]
+
     # ===========================================================================
     # error checking
-    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",args.semester,args.course)
+    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",semester,course)
     if not os.path.isdir(course_dir):
         print("ERROR! ",course_dir," is not a valid course directory")
         exit(1)
-    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",args.gradeable)
+    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",gradeable)
     if not os.path.isdir(concatenated_dir):
         print("ERROR! ",concatenated_dir," is not a valid gradeable concatenated directory")
         exit(1)

-    tokenized_dir=os.path.join(course_dir,"lichen","tokenized",args.gradeable)
+    tokenized_dir=os.path.join(course_dir,"lichen","tokenized",gradeable)

     # ===========================================================================
     # walk the subdirectories
-    for user in os.listdir(concatenated_dir):
-        for version in os.listdir(os.path.join(concatenated_dir,user)):
+    for user in sorted(os.listdir(concatenated_dir)):
+        for version in sorted(os.listdir(os.path.join(concatenated_dir,user))):
             my_concatenated_file=os.path.join(concatenated_dir,user,version,"submission.concatenated")

             # ===========================================================================
             # create the directory
             my_tokenized_dir=os.path.join(tokenized_dir,user,version)
             if not os.path.isdir(my_tokenized_dir):
                 os.makedirs(my_tokenized_dir)

             my_tokenized_file=os.path.join(my_tokenized_dir,"tokens.json")
             tokenize(args,my_concatenated_file,my_tokenized_file)

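One behavioral difference from the old mutually exclusive --plaintext/--python/--cpp flags: argparse rejected an unknown language automatically, whereas a bad "language" value in the JSON now falls through every branch of tokenize() without producing output. A defensive check along these lines (not part of this diff) would restore that guarantee:

    SUPPORTED_LANGUAGES = {"plaintext", "python", "cpp"}
    if lichen_config_data["language"] not in SUPPORTED_LANGUAGES:
        print("ERROR! unsupported language:", lichen_config_data["language"])
        exit(1)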
36 changes: 27 additions & 9 deletions compare_hashes/compare_hashes.cpp
@@ -125,13 +125,23 @@ int main(int argc, char* argv[]) {

   // ---------------------------------------------------------------------------
   // deal with command line arguments
-  assert (argc == 6);
-  std::string semester = argv[1];
-  std::string course = argv[2];
-  std::string gradeable = argv[3];
-  assert (argv[4] == std::string("--window"));
-  int window = std::stoi(std::string(argv[5]));
-  assert (window >= 1);
+  assert (argc == 2);
+  std::string config_file = argv[1];
+
+  std::ifstream istr(config_file.c_str());
+  assert (istr.good());
+  nlohmann::json config_file_json = nlohmann::json::parse(istr);
+
+  std::string semester = config_file_json.value("semester","ERROR");
+  std::string course = config_file_json.value("course","ERROR");
+  std::string gradeable = config_file_json.value("gradeable","ERROR");
+  std::string sequence_length_str = config_file_json.value("sequence_length","1");
+  int sequence_length = std::stoi(sequence_length_str);
+  std::string threshold_str = config_file_json.value("threshold","5");
+  int threshold = std::stoi(threshold_str);
+
+  assert (sequence_length >= 1);
+  assert (threshold >= 2);

   // error checking, confirm there are hashes to work with
   std::string tmp = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/hashes/"+gradeable;
@@ -180,6 +190,8 @@ int main(int argc, char* argv[]) {
     }
   }

+  std::cout << "finished loading" << std::endl;
+
   // ---------------------------------------------------------------------------

   // label the parts of the file that are common to many
@@ -194,21 +206,26 @@ int main(int argc, char* argv[]) {
   // user,version -> ( position -> ( other user,version -> std::vector<Sequence> ) )
   std::map<Submission,std::map<int,std::map<Submission, std::vector<Sequence> > > > suspicious;

+  int my_counter = 0;
+
   // ---------------------------------------------------------------------------
   // walk over the structure containing all of the hashes identifying
   // common to many/all, provided code, suspicious matches, and unique code
   for (hashed_sequences::iterator itr = hash_counts.begin(); itr != hash_counts.end(); itr++) {
     int count = itr->second.size();

-    if (count >= 20) {
+    my_counter++;
+
+    std::cout << "hash walk " << hash_counts.size() << " " << my_counter << std::endl;
+
+    if (count > threshold) {
       // common to many/all
       for (std::map<std::string,std::vector<Sequence> >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) {
         for (int i = 0; i < itr2->second.size(); i++) {
           common[itr2->second[i].submission].insert(itr2->second[i].position);
         }
       }
-    } else if (count > 1 && count < 20) {
+    } else if (count > 1 && count <= threshold) {
       // suspicious matches
       for (std::map<std::string,std::vector<Sequence> >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) {
         std::string username = itr2->first;
@@ -234,6 +251,7 @@ int main(int argc, char* argv[]) {
     }
  }

+  std::cout << "finished walking" << std::endl;

  // ---------------------------------------------------------------------------
  // prepare a sorted list of all users sorted by match percent
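The walk over hash_counts now classifies each hashed sequence against the configurable threshold instead of the hard-coded 20: shared by more than threshold submissions means common (likely provided/boilerplate) code, shared by two up to threshold submissions is suspicious, and a count of one is unique. A Python restatement of that decision, as a sketch rather than the C++ itself:

    def classify(count, threshold):
        # count = how many submissions contain this hashed sequence
        if count > threshold:
            return "common"      # matched by many: likely provided/boilerplate code
        elif count > 1:
            return "suspicious"  # shared by a few submissions: flag for review
        else:
            return "unique"      # appears in only one submission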
19 changes: 15 additions & 4 deletions tokenizer/plaintext/plaintext_tokenizer.cpp
@@ -13,6 +13,19 @@ void usage(const std::string &program) {
 }


+void deal_with_number(std::map<std::string,nlohmann::json>& tmp, const std::string& token) {
+  try {
+    // normal case, convert to integer
+    tmp["type"]="number";
+    tmp["value"]=std::stoi(token);
+  } catch (...) {
+    // if conversion fails (integer too big!)
+    tmp["type"]="string";
+    tmp["value"]=token;
+  }
+}
+
+
 int main(int argc, char* argv[]) {

   // ------------------------------
@@ -72,8 +85,7 @@ int main(int argc, char* argv[]) {
         tmp["char"]=start_col;
         if (last_was_digit) {
           assert (!last_was_alpha);
-          tmp["type"]="number";
-          tmp["value"]=std::stoi(token);
+          deal_with_number(tmp,token);
         } else {
           assert (last_was_alpha);
           tmp["type"]="string";
@@ -171,8 +183,7 @@ int main(int argc, char* argv[]) {
       tmp["char"]=start_col;
       if (last_was_digit) {
         assert (!last_was_alpha);
-        tmp["type"]="number";
-        tmp["value"]=std::stoi(token);
+        deal_with_number(tmp,token);
       } else {
         assert (last_was_alpha);
         tmp["type"]="string";
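The new deal_with_number() helper fixes a crash: std::stoi throws std::out_of_range for a numeric literal too large for int, which previously aborted the tokenizer mid-file; oversized literals are now stored as string tokens instead. A hypothetical smoke test driving the built binary from Python (the binary path is an assumption):

    import subprocess

    # a literal like 99999999999999999999 overflows int and used to abort the
    # tokenizer; with the fix it comes back as a {"type": "string"} token
    result = subprocess.run(["./plaintext_tokenizer.out"],
                            input="small 42 huge 99999999999999999999",
                            capture_output=True, text=True)
    print(result.stdout)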
