From 1b78171c16be15a469f58436497e424ba2788158 Mon Sep 17 00:00:00 2001 From: wendycwong Date: Wed, 29 Mar 2017 09:53:43 -0700 Subject: [PATCH] PUBDEV_4110-find-intermittents: (#972) - changed run.py to show failed test name correctly; - added scrapeForIntermittents.py to gather failed tests information if a Jenkins job failed. - added summarizeIntermittents.py to gather all failed tests and collect tests info that failed too often as intermittents. PUBDEV_4110-find-intermittents: changed time from timestamp to string time. Printed out most recent failure for intermittents per Michal code review suggestion. PUBDEV_4110-find-intermittents: Add timezone info. --- scripts/run.py | 7 +- scripts/scrapeForIntermittents.py | 358 ++++++++++++++++++++++++++++++ scripts/summarizeIntermittens.py | 208 +++++++++++++++++ 3 files changed, 570 insertions(+), 3 deletions(-) create mode 100755 scripts/scrapeForIntermittents.py create mode 100755 scripts/summarizeIntermittens.py diff --git a/scripts/run.py b/scripts/run.py index 7a3cdffda911..bba4c3ef0fd7 100755 --- a/scripts/run.py +++ b/scripts/run.py @@ -1814,7 +1814,7 @@ def _report_xunit_result(self, testsuite_name, testcase_name, testcase_runtime, failure_message += "\n\n" failure_message += "#" * 83 + "\n" failure_message += "########### Problems encountered extracting Java messages. " \ - "Please alert the QA team.\n" + "Massive Jenkins or test failure.\n" failure_message += "#" * 83 + "\n\n" if failure_message: @@ -1824,13 +1824,14 @@ def _report_xunit_result(self, testsuite_name, testcase_name, testcase_runtime, else: failure = "" + # fixed problem with test name repeated in Jenkins job test report. 
xml_report = """ - + {failure} -""".format(testsuiteName=testsuite_name, testcaseClassName=testcase_name, testcaseName=testcase_name, +""".format(testsuiteName=testsuite_name, testcaseName=testcase_name, testcaseRuntime=testcase_runtime, failure=failure, errors=errors, failures=failures, skip=skip) diff --git a/scripts/scrapeForIntermittents.py b/scripts/scrapeForIntermittents.py new file mode 100755 index 000000000000..4e0c6d5ea00f --- /dev/null +++ b/scripts/scrapeForIntermittents.py @@ -0,0 +1,358 @@ +#!/usr/bin/python + +import sys +import os +import json +import subprocess +import time +import datetime +from pytz import timezone +from dateutil import parser + +""" +This script will be invoked if it is included in the post-build action of an jenkin job and the job has failed. + +It will perform the following tasks: +1. attach the failure informaiton of all tests in the current build to a summary file including the following fields: + - timestamp, jenkin_job_name, build_id, git_has, node_name, build_failure(test failed but to build failure), + JUnit/PyUnit/RUnit/Hadoop, testName. +2. save the above summary file to s3 somewhere using the command: s3cmd put "$TEST_OUTPUT_FILE" s3://ai.h2o.tests/jenkins/ +3. store the failed test info in a dictionary and save it to s3 as well; +4. for failed tests, save the txt failed test results to aid the debugging process. Attach timestamp to file name + in order to aid the process of cleaning out the file directory with obsolete files. +""" + +# -------------------------------------------------------------------- +# Main program +# -------------------------------------------------------------------- + +g_test_root_dir = os.path.dirname(os.path.realpath(__file__)) # directory where we are running out code from +g_script_name = '' # store script name. 
# ---------------------------------------------------------------------------
# NOTE(review): this patch text was whitespace-mangled in transit (newlines
# collapsed, angle-bracketed/HTML-escaped fragments dropped).  What follows is
# a cleaned-up reconstruction of the remainder of
# scripts/scrapeForIntermittents.py and of scripts/summarizeIntermittens.py.
# Places where the original text was destroyed are flagged TODO(review).
# ---------------------------------------------------------------------------

g_timestamp = ''
g_job_name = ''
g_build_id = ''
g_git_hash = ''
g_node_name = ''
g_unit_test_type = ''
g_jenkins_url = ''
g_temp_filename = os.path.join(g_test_root_dir, 'tempText')  # temp file to store data curled from Jenkins
g_failed_testnames = []        # names of failed tests scraped from the console output
g_failed_test_paths = []       # testReport URL paths of the failed tests
g_failed_tests_dict = ''       # filename of the JSON file holding failed-tests info as a dictionary
g_failed_tests_info_dict = dict()  # failed-tests info, parallel lists under "TestName"/"TestInfo"
g_resource_url = ''
g_timestring = ''


def init_failed_tests_dict():
    """
    Initialize the two parallel lists of the failed-tests dictionary.

    :return: None
    """
    # BUG FIX: the original declared "global g_failed_test_info_dict" (missing
    # an 's'), so the global statement named a different, nonexistent variable.
    # It happened to work only because the dict is mutated, never rebound.
    global g_failed_tests_info_dict
    g_failed_tests_info_dict["TestName"] = []
    g_failed_tests_info_dict["TestInfo"] = []


def init_update_each_failed_test_dict(one_test_info, failed_test_path, testName, newTest):
    """
    Create (when newTest is True) and update the per-test failure record.

    Each failed test gets a dict stored in the "TestInfo" field of
    g_failed_tests_info_dict with these fields:
      "JenkinsJobName", "BuildID", "Timestamp" (seconds), "GitHash",
      "TestCategory" (JUnit, PyUnit, RUnit, HadoopPyUnit or HadoopRUnit),
      "NodeName", "FailureMessages",
      "FailureCount": number of recorded failures; a test with
      FailureCount >= 2 can be considered an intermittent.

    :param one_test_info: existing per-test dict (ignored when newTest is True)
    :param failed_test_path: testReport URL path of the failed test
    :param testName: name of the failed test
    :param newTest: True when no record exists yet for this test
    :return: the new/updated per-test dict
    """
    if newTest:
        one_test_info = dict()
        one_test_info["JenkinsJobName"] = []
        one_test_info["BuildID"] = []
        one_test_info["Timestamp"] = []
        one_test_info["GitHash"] = []
        one_test_info["TestCategory"] = []
        one_test_info["NodeName"] = []
        one_test_info["FailureMessages"] = []
        one_test_info["FailureCount"] = 0
        one_test_info["TestName"] = testName

    one_test_info["JenkinsJobName"].append(g_job_name)
    one_test_info["BuildID"].append(g_build_id)
    one_test_info["Timestamp"].append(g_timestamp)
    one_test_info["GitHash"].append(g_git_hash)
    one_test_info["TestCategory"].append(g_unit_test_type)
    one_test_info["NodeName"].append(g_node_name)
    one_test_info["FailureCount"] += 1

    error_url = '/'.join([g_resource_url, 'testReport', failed_test_path])
    get_console_out(error_url)  # store failure message in g_temp_filename

    if os.path.isfile(g_temp_filename):
        with open(g_temp_filename, 'r') as error_file:
            one_test_info["FailureMessages"].append(error_file.read())
    else:
        one_test_info["FailureMessages"].append("")  # empty message if fetch failed
    return one_test_info


def usage():
    """
    Print USAGE help.
    """
    print("")
    print("Usage: ")
    print("python scrapeForIntermittents timestamp job_name build_id git_sha node_name unit_test_category "
          "jenkins_URL output_filename output_dict_name month_of_data_to_keep")
    print(" The unit_test_category can be 'junit', 'pyunit' or 'runit'.")
    print(" The output_dict_name is the filename that we will save a dictionary structure of the failed "
          "unit tests.")
    print(" The month_of_data_to_keep is an integer indicating how many months that we want to keep the "
          "data starting from now.  Any data that is older than the value will be deleted.")


def get_console_out(url_string):
    """
    Grab the console output from Jenkins and save the content into a temp file
    (g_temp_filename).  From the saved text file, we can grab the names of
    failed tests.

    :param url_string: Jenkins resource whose body we want, e.g.
        resource_url/job/job_name/build_id/testReport/
    :return: None
    """
    # SECURITY FIX: the original concatenated the URL into a shell string and
    # ran subprocess.call(..., shell=True); a crafted URL could inject shell
    # commands.  Pass an argument list instead and redirect stdout ourselves.
    with open(g_temp_filename, 'w') as out_file:
        subprocess.call(['curl', url_string, '--user', 'admin:admin'], stdout=out_file)


def extract_failed_tests_info():
    """
    Scrape the saved console output (pyunit/runit/hadoop runs) and collect the
    failed test names plus their testReport paths so the test execution
    summary can be located later.

    TODO(review): the HTML-matching string literals of this function were
    destroyed when the patch text was mangled (everything between angle
    brackets was dropped).  The parsing below is a best-effort reconstruction
    and MUST be verified against the real Jenkins console format.

    :return: None
    """
    global g_failed_testnames
    global g_failed_test_paths

    if os.path.isfile(g_temp_filename):
        with open(g_temp_filename, 'r') as console_file:
            for each_line in console_file:
                # BUG FIX: the original called each_line.strip() and discarded
                # the result (str.strip returns a new string).
                each_line = each_line.strip()
                if ("Test Result" in each_line) and ("failure" in each_line):
                    temp = each_line.split("testReport")
                    if len(temp) > 1 and ("Test Result" in temp[1]) and ("failure" in temp[1]):
                        # temp[2:] should each contain one failed-test anchor:
                        # a quoted testReport path followed by the test name.
                        for findex in range(2, len(temp)):
                            tempMess = temp[findex].split(">")
                            if len(tempMess) >= 2:
                                g_failed_test_paths.append(tempMess[0].strip('"'))
                                g_failed_testnames.append(tempMess[1].strip())


def save_failed_tests_info():
    """
    Append one CSV line per failed test to g_summary_text_filename and update
    the JSON dictionary file g_failed_tests_dict.

    TODO(review): the head of this function (its def line and the first few
    statements) was lost in the mangled patch; reconstructed from the
    surviving body text.

    :return: None
    """
    global g_failed_tests_info_dict

    if len(g_failed_testnames) > 0:  # found failed tests
        if os.path.isfile(g_failed_tests_dict) and os.path.getsize(g_failed_tests_dict) > 10:
            try:
                g_failed_tests_info_dict = json.load(open(g_failed_tests_dict, 'r'))
            except ValueError:  # BUG FIX: catch JSON decode errors only, not a bare except
                init_failed_tests_dict()
        else:  # file not found (or trivially small): start a fresh dictionary
            init_failed_tests_dict()

        with open(g_summary_text_filename, 'a') as failed_file:
            for index in range(len(g_failed_testnames)):
                testInfo = ','.join([g_timestring, g_job_name, str(g_build_id), g_git_hash,
                                     g_node_name, g_unit_test_type, g_failed_testnames[index]])
                failed_file.write(testInfo)
                failed_file.write('\n')
                # update failed tests dictionary
                update_failed_test_info_dict(g_failed_testnames[index], g_failed_test_paths[index])
        json.dump(g_failed_tests_info_dict, open(g_failed_tests_dict, 'w'))


def update_failed_test_info_dict(failed_testname, failed_test_path):
    """
    Update the dictionary structure that stores failed unit-test information.

    :param failed_testname: string containing name of failed test.
    :param failed_test_path: string containing the path to failed test url.
    :return: None
    """
    global g_failed_tests_info_dict

    if failed_testname in g_failed_tests_info_dict["TestName"]:  # existing test
        t_index = g_failed_tests_info_dict["TestName"].index(failed_testname)
        g_failed_tests_info_dict["TestInfo"][t_index] = init_update_each_failed_test_dict(
            g_failed_tests_info_dict["TestInfo"][t_index], failed_test_path, failed_testname, False)
    else:  # new test
        g_failed_tests_info_dict["TestName"].append(failed_testname)
        g_failed_tests_info_dict["TestInfo"].append(
            init_update_each_failed_test_dict(dict(), failed_test_path, failed_testname, True))


def trim_data_back_to(monthToKeep):
    """
    Remove data older than monthToKeep months (30-day months) from both the
    summary text file and the dictionary file.

    :param monthToKeep: number of months of data to keep, counted back from now
    :return: None
    """
    current_time = time.time()  # seconds
    oldest_time_allowed = current_time - monthToKeep * 30 * 24 * 3600  # seconds
    clean_up_failed_test_dict(oldest_time_allowed)
    clean_up_summary_text(oldest_time_allowed)


def clean_up_failed_test_dict(oldest_time_allowed):
    """
    Drop per-test failure records whose timestamp (seconds) is older than
    oldest_time_allowed, then rewrite the dictionary file.

    :param oldest_time_allowed: cutoff timestamp in seconds since the epoch
    :return: None
    """
    global g_failed_tests_info_dict
    if os.path.isfile(g_failed_tests_dict) and os.path.getsize(g_failed_tests_dict) > 10:
        try:
            g_failed_tests_info_dict = json.load(open(g_failed_tests_dict, 'r'))
        except ValueError:  # BUG FIX: was a bare except swallowing everything
            return  # unreadable dictionary file; nothing to clean

        test_index = 0
        while test_index < len(g_failed_tests_info_dict["TestName"]):
            test_dicts = g_failed_tests_info_dict["TestInfo"][test_index]  # one test's record

            dict_index = 0
            while dict_index < len(test_dicts["Timestamp"]):
                if test_dicts["Timestamp"][dict_index] < oldest_time_allowed:
                    del test_dicts["JenkinsJobName"][dict_index]
                    del test_dicts["BuildID"][dict_index]
                    del test_dicts["Timestamp"][dict_index]
                    del test_dicts["GitHash"][dict_index]
                    del test_dicts["TestCategory"][dict_index]
                    del test_dicts["NodeName"][dict_index]
                    # BUG FIX: also drop the matching failure message so the
                    # parallel lists stay aligned (original forgot this one).
                    if dict_index < len(test_dicts.get("FailureMessages", [])):
                        del test_dicts["FailureMessages"][dict_index]
                    test_dicts["FailureCount"] -= 1
                else:
                    dict_index = dict_index + 1

            if test_dicts["FailureCount"] <= 0:  # remove tests with no remaining failures
                # BUG FIX: the original deleted key "Testname" (wrong case),
                # which raised KeyError that the bare except silently hid.
                del g_failed_tests_info_dict["TestName"][test_index]
                del g_failed_tests_info_dict["TestInfo"][test_index]
            else:
                test_index = test_index + 1

        json.dump(g_failed_tests_info_dict, open(g_failed_tests_dict, 'w'))


def clean_up_summary_text(oldest_time_allowed):
    """
    Rewrite the CSV summary file keeping only lines whose leading time string
    parses to a timestamp newer than oldest_time_allowed.

    :param oldest_time_allowed: cutoff timestamp in seconds since the epoch
    :return: None
    """
    if os.path.isfile(g_summary_text_filename):
        with open(g_summary_text_filename, 'r') as text_file:
            with open(g_temp_filename, 'w') as temp_file:
                for each_line in text_file:
                    temp = each_line.split(',')
                    if len(temp) >= 7:  # well-formed summary line has 7 fields
                        dateObj = parser.parse(temp[0]).timetuple()
                        timestamp = time.mktime(dateObj)
                        if timestamp > oldest_time_allowed:
                            temp_file.write(each_line)

        with open(g_summary_text_filename, 'w') as text_file:  # write content back to original summary file
            with open(g_temp_filename, 'r') as temp_file:
                text_file.write(temp_file.read())


def main(argv):
    """
    Main program.  Expects the script name plus, in order:
      1. timestamp (seconds), 2. JOB_NAME, 3. BUILD_ID, 4. GIT_COMMIT,
      5. NODE_NAME, 6. unit test category (JUnit/PyUnit/RUnit/Hadoop),
      7. JENKINS_URL, 8. summary text filename,
      9. failed-tests dictionary filename,
      10. months of data to keep (older data is removed).

    :return: None
    """
    global g_script_name
    global g_test_root_dir
    global g_timestamp
    global g_job_name
    global g_build_id
    global g_git_hash
    global g_node_name
    global g_unit_test_type
    global g_jenkins_url
    global g_temp_filename
    global g_summary_text_filename  # store failed test info in csv format
    global g_failed_tests_dict      # store failed test info as a dictionary
    global g_resource_url
    global g_timestring

    if len(argv) < 11:
        # BUG FIX: was a Python 2 print statement in an otherwise
        # print()-function file.
        print("Wrong call.  Not enough arguments.\n")
        usage()
        sys.exit(1)
    else:  # we may be in business
        g_script_name = os.path.basename(argv[0])  # name of script being run
        g_timestamp = float(argv[1])
        g_job_name = argv[2]
        g_build_id = argv[3]
        g_git_hash = argv[4]
        g_node_name = argv[5]
        g_unit_test_type = argv[6]
        g_jenkins_url = argv[7]

        localtz = time.tzname[0]
        dt = parser.parse(time.ctime(g_timestamp) + ' ' + localtz)
        g_timestring = dt.strftime("%a %b %d %H:%M:%S %Y %Z")
        g_temp_filename = os.path.join(g_test_root_dir, 'tempText')
        g_summary_text_filename = os.path.join(g_test_root_dir, argv[8])
        g_failed_tests_dict = os.path.join(g_test_root_dir, argv[9])
        monthToKeep = float(argv[10])

        g_resource_url = '/'.join([g_jenkins_url, "job", g_job_name, g_build_id])
        get_console_out(g_resource_url + "/#showFailuresLink/")  # save remote console output locally
        extract_failed_tests_info()  # grab the console text and store the failed tests/paths
        save_failed_tests_info()     # save new failed test info into a file
        if monthToKeep > 0:
            trim_data_back_to(monthToKeep)  # remove data that is too old, to save space


if __name__ == "__main__":
    main(sys.argv)

# ===========================================================================
# diff --git a/scripts/summarizeIntermittens.py b/scripts/summarizeIntermittens.py
# new file mode 100755
# The remainder of this reconstruction is scripts/summarizeIntermittens.py.
# ===========================================================================
#!/usr/bin/python

import sys
import os
from os import listdir
from os.path import isfile, join
import time
import json
import datetime
from pytz import timezone  # NOTE(review): imported by the original but never used
from dateutil import parser

"""
This script summarizes failed test results and determines whether any of them
may be intermittents.  For tests that are determined to be intermittents, a
dictionary structure is generated to store information about them.

Currently a simple threshold test decides intermittency: any test whose
failure count reaches the threshold is labeled intermittent.  For each
intermittent the following fields are stored:
  "JenkinsJobName", "BuildID", "Timestamp", "GitHash", "TestCategory",
  "NodeName", "FailureMessages", "FailureCount"
"""

# --------------------------------------------------------------------
# Main program
# --------------------------------------------------------------------

g_test_root_dir = os.path.dirname(os.path.realpath(__file__))  # directory this script runs from
g_threshold_failure = 0        # failure count at/above which a test is intermittent
g_summary_dict_name = ''       # path of the JSON file the final summary is written to
g_file_start = []              # filename prefixes of the per-run failure dictionaries

g_summary_dict_intermittents = dict()  # only the tests deemed intermittent
g_summary_dict_all = dict()            # all failed tests merged from every input file


def init_intermittents_dict(init_dict):
    """
    Initialize the two parallel lists of a failed-tests dictionary.

    :param init_dict: the dict to initialize in place
    :return: None
    """
    init_dict["TestName"] = []
    init_dict["TestInfo"] = []


def usage():
    """
    Print USAGE help.
    """
    print("")
    print("Usage: ")
    print("python summarizeIntermittents threshold Filename_for_dict AWS_path "
          "Failed_PyUnits_summary_dict_from ....")
    print("- threshold is an integer for which a failed test is labeled intermittent if its number of "
          "failure exceeds it.")
    print("- Filename_for_dict is a string denoting the name of the dictionary that will store the final "
          "intermittents.")
    print("- Failed_PyUnits_summary_dict_from is a string denoting the beginning of files that contain "
          "failed test dictionaries.")
    print("- ... denotes extra strings that represent the beginning of files that you want us to "
          "summarize for you.")
    # NOTE(review): usage mentions AWS_path but main() treats argv[3] onward as
    # filename prefixes — confirm the intended argument list with the author.


def summarizeFailedRuns():
    """
    Scan g_test_root_dir for summary files whose names contain one of the
    g_file_start prefixes and merge their failed tests into g_summary_dict_all.

    :return: None
    """
    global g_summary_dict_all

    onlyFiles = [x for x in listdir(g_test_root_dir) if isfile(join(g_test_root_dir, x))]  # grab files

    for f in onlyFiles:
        for fileStart in g_file_start:
            fFullPath = os.path.join(g_test_root_dir, f)
            # BUG FIX: size check must use the full path; the original called
            # os.path.getsize(f) on the bare name, which only works when the
            # current working directory happens to be g_test_root_dir.
            if (fileStart in f) and (os.path.getsize(fFullPath) > 10):
                try:
                    temp_dict = json.load(open(fFullPath, 'r'))
                    # merge every failed test of this run into the big summary
                    for ind in range(len(temp_dict["TestName"])):
                        addFailedTests(g_summary_dict_all, temp_dict, ind)
                except (ValueError, KeyError):  # skip unreadable or ill-formed files
                    continue
                break


def addFailedTests(summary_dict, temp_dict, index):
    """
    Merge the index-th test of temp_dict into summary_dict, creating a new
    record when the test has not been seen before.

    :param summary_dict: accumulating dict with "TestName"/"TestInfo" lists
    :param temp_dict: source dict in the same format
    :param index: position of the test inside temp_dict
    :return: None
    """
    testName = temp_dict["TestName"][index]
    testNameList = summary_dict["TestName"]
    if testName in testNameList:  # known test: merge into the existing record
        testIndex = testNameList.index(testName)
        updateFailedTestInfo(summary_dict, temp_dict["TestInfo"][index], testIndex, testName, False)
    else:  # first sighting of this test
        summary_dict["TestName"].append(testName)
        updateFailedTestInfo(summary_dict, temp_dict["TestInfo"][index],
                             len(summary_dict["TestName"]) - 1, testName, True)


def updateFailedTestInfo(summary_dict, one_test_info, testIndex, testName, newTest):
    """
    Create (when newTest is True) and extend the per-test record at testIndex
    of summary_dict["TestInfo"] with the data in one_test_info.

    Fields: "JenkinsJobName", "BuildID", "Timestamp" (seconds), "GitHash",
    "TestCategory", "NodeName", "FailureCount" (a test with FailureCount >= 2
    can be considered intermittent).

    NOTE(review): "FailureMessages" is intentionally not carried over here in
    the original — confirm whether messages should be merged too.

    :return: None
    """
    if newTest:  # set up the dict structure to store the new data
        summary_dict["TestInfo"].append(dict())
        summary_dict["TestInfo"][testIndex]["JenkinsJobName"] = []
        summary_dict["TestInfo"][testIndex]["BuildID"] = []
        summary_dict["TestInfo"][testIndex]["Timestamp"] = []
        summary_dict["TestInfo"][testIndex]["GitHash"] = []
        summary_dict["TestInfo"][testIndex]["TestCategory"] = []
        summary_dict["TestInfo"][testIndex]["NodeName"] = []
        summary_dict["TestInfo"][testIndex]["FailureCount"] = 0
        summary_dict["TestInfo"][testIndex]["TestName"] = testName  # add test name

    summary_dict["TestInfo"][testIndex]["JenkinsJobName"].extend(one_test_info["JenkinsJobName"])
    summary_dict["TestInfo"][testIndex]["BuildID"].extend(one_test_info["BuildID"])
    summary_dict["TestInfo"][testIndex]["Timestamp"].extend(one_test_info["Timestamp"])
    summary_dict["TestInfo"][testIndex]["GitHash"].extend(one_test_info["GitHash"])
    summary_dict["TestInfo"][testIndex]["TestCategory"].extend(one_test_info["TestCategory"])
    summary_dict["TestInfo"][testIndex]["NodeName"].extend(one_test_info["NodeName"])
    summary_dict["TestInfo"][testIndex]["FailureCount"] += one_test_info["FailureCount"]


def extractPrintSaveIntermittens():
    """
    Print each intermittent (name, most recent failure, failure count, first
    failure) and save the intermittents dictionary to g_summary_dict_name.

    :return: None
    """
    global g_summary_dict_intermittents

    localtz = time.tzname[0]

    # a test is intermittent when its failure count reaches the threshold
    for ind in range(len(g_summary_dict_all["TestName"])):
        if g_summary_dict_all["TestInfo"][ind]["FailureCount"] >= g_threshold_failure:
            addFailedTests(g_summary_dict_intermittents, g_summary_dict_all, ind)

    for ind in range(len(g_summary_dict_intermittents["TestName"])):
        testName = g_summary_dict_intermittents["TestName"][ind]
        numberFailure = g_summary_dict_intermittents["TestInfo"][ind]["FailureCount"]
        firstFailedTS = parser.parse(
            time.ctime(min(g_summary_dict_intermittents["TestInfo"][ind]["Timestamp"])) + ' ' + localtz)
        firstFailedStr = firstFailedTS.strftime("%a %b %d %H:%M:%S %Y %Z")
        recentFail = parser.parse(
            time.ctime(max(g_summary_dict_intermittents["TestInfo"][ind]["Timestamp"])) + ' ' + localtz)
        recentFailStr = recentFail.strftime("%a %b %d %H:%M:%S %Y %Z")
        print("Intermittent: {0} last failed at {1} and has failed {2} times since "
              "{3}.".format(testName, recentFailStr, numberFailure, firstFailedStr))
    # save dict in file (only when something was found)
    if len(g_summary_dict_intermittents["TestName"]) > 0:
        json.dump(g_summary_dict_intermittents, open(g_summary_dict_name, 'w'))


def main(argv):
    """
    Main program.  Expects the script name plus, in order:
      1. threshold: failure count at which a test is declared intermittent
      2. filename where the final dict structure will be stored
      3. string that denotes the beginning of a file containing failed-test info
      4+. optional further such prefixes

    :return: None
    """
    global g_script_name
    global g_test_root_dir
    global g_threshold_failure
    global g_file_start
    global g_summary_dict_name
    global g_summary_dict_all
    global g_summary_dict_intermittents

    if len(argv) < 5:
        # BUG FIX: was a Python 2 print statement in an otherwise
        # print()-function file.
        print("Wrong call.  Not enough arguments.\n")
        usage()
        sys.exit(1)
    else:  # we may be in business
        g_threshold_failure = int(argv[1])
        g_summary_dict_name = os.path.join(g_test_root_dir, argv[2])

        for ind in range(3, len(argv)):
            g_file_start.append(argv[ind])

        init_intermittents_dict(g_summary_dict_all)
        init_intermittents_dict(g_summary_dict_intermittents)
        summarizeFailedRuns()
        extractPrintSaveIntermittens()


if __name__ == "__main__":
    main(sys.argv)