forked from h2oai/h2o-3
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PUBDEV_4110-find-intermittents: (h2oai#972)
- changed run.py to show failed test name correctly; - added scrapeForIntermittents.py to gather failed tests information if a Jenkins job failed. - added summarizeIntermittents.py to gather all failed tests and collect tests info that failed too often as intermittents. PUBDEV_4110-find-intermittents: changed time from timestamp to string time. Printed out most recent failure for intermittents per Michal code review suggestion. PUBDEV_4110-find-intermittents: Add timezone info.
- Loading branch information
1 parent
c47a938
commit 1b78171
Showing
3 changed files
with
570 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,358 @@ | ||
#!/usr/bin/python | ||
|
||
import sys | ||
import os | ||
import json | ||
import subprocess | ||
import time | ||
import datetime | ||
from pytz import timezone | ||
from dateutil import parser | ||
|
||
""" | ||
This script will be invoked if it is included in the post-build action of an jenkin job and the job has failed. | ||
It will perform the following tasks: | ||
1. attach the failure informaiton of all tests in the current build to a summary file including the following fields: | ||
- timestamp, jenkin_job_name, build_id, git_has, node_name, build_failure(test failed but to build failure), | ||
JUnit/PyUnit/RUnit/Hadoop, testName. | ||
2. save the above summary file to s3 somewhere using the command: s3cmd put "$TEST_OUTPUT_FILE" s3://ai.h2o.tests/jenkins/ | ||
3. store the failed test info in a dictionary and save it to s3 as well; | ||
4. for failed tests, save the txt failed test results to aid the debugging process. Attach timestamp to file name | ||
in order to aid the process of cleaning out the file directory with obsolete files. | ||
""" | ||
|
||
# -------------------------------------------------------------------- | ||
# Main program | ||
# -------------------------------------------------------------------- | ||
|
||
g_test_root_dir = os.path.dirname(os.path.realpath(__file__)) # directory where we are running out code from | ||
g_script_name = '' # store script name. | ||
g_timestamp = '' | ||
g_job_name = '' | ||
g_build_id = '' | ||
g_git_hash = '' | ||
g_node_name = '' | ||
g_unit_test_type = '' | ||
g_jenkins_url = '' | ||
g_temp_filename = os.path.join(g_test_root_dir,'tempText') # temp file to store data curled from Jenkins | ||
g_failed_testnames = [] | ||
g_failed_test_paths = [] | ||
g_failed_tests_dict = '' # contains the filename that contains failed tests info in a dictionary | ||
g_failed_tests_info_dict = dict() # store failed tests info in a dictionary | ||
g_resource_url = '' | ||
g_timestring = '' | ||
|
||
def init_failed_tests_dict(): | ||
""" | ||
initialize the fields of dictionary storing failed tests. | ||
:return: | ||
""" | ||
global g_failed_test_info_dict | ||
g_failed_tests_info_dict["TestName"] = [] | ||
g_failed_tests_info_dict["TestInfo"] = [] | ||
|
||
|
||
def init_update_each_failed_test_dict(one_test_info, failed_test_path, testName, newTest): | ||
""" | ||
For each test, a dictionary structure will be built to record the various info about that test's failure | ||
information. In particular, for each failed tests, there will be a dictionary associated with that test | ||
stored in the field "TestInfo" of g_faiiled_tests_info_dict. The following fields are included: | ||
"JenkinsJobName": job name | ||
"BuildID" | ||
"Timestamp": in seconds | ||
"GitHash" | ||
"TestCategory": JUnit, PyUnit, RUnit or HadoopPyUnit, HadoopRUnit | ||
"NodeName": name of machine that the job was run on | ||
"FailureCount": integer counting number of times this particular test has failed. An intermittent can be | ||
determined as any test with FailureCount >= 2. | ||
"FailureMessages": contains failure messages for the test | ||
:return: a new dict for that test | ||
""" | ||
if newTest: | ||
one_test_info = dict() | ||
one_test_info["JenkinsJobName"] = [] | ||
one_test_info["BuildID"] = [] | ||
one_test_info["Timestamp"] = [] | ||
one_test_info["GitHash"] = [] | ||
one_test_info["TestCategory"] = [] # would be JUnit, PyUnit, RUnit or HadoopPyUnit, HadoopRUnit | ||
one_test_info["NodeName"] = [] | ||
one_test_info["FailureMessages"] = [] # contains failure messages for the test | ||
one_test_info["FailureCount"] = 0 | ||
one_test_info["TestName"] = testName | ||
|
||
# if g_timestamp not in one_test_info["Timestamp"]: | ||
one_test_info["JenkinsJobName"].append(g_job_name) | ||
one_test_info["BuildID"].append(g_build_id) | ||
one_test_info["Timestamp"].append(g_timestamp) | ||
one_test_info["GitHash"].append(g_git_hash) | ||
one_test_info["TestCategory"].append(g_unit_test_type) # would be JUnit, PyUnit, RUnit or HadoopPyUnit, HadoopRUnit | ||
one_test_info["NodeName"].append(g_node_name) | ||
one_test_info["FailureCount"] += 1 | ||
|
||
error_url = '/'.join([g_resource_url, 'testReport', failed_test_path]) | ||
get_console_out(error_url) # store failure message in temp file | ||
|
||
if os.path.isfile(g_temp_filename): | ||
with open(g_temp_filename, 'r') as error_file: | ||
one_test_info["FailureMessages"].append(error_file.read()) | ||
else: | ||
one_test_info["FailureMessages"].append("") # append empty error message if file not found | ||
return one_test_info | ||
|
||
def usage(): | ||
""" | ||
Print USAGE help. | ||
""" | ||
print("") | ||
print("Usage: ") | ||
print("python scrapeForIntermittents timestamp job_name build_id git_sha node_name unit_test_category jenkins_URL " | ||
"output_filename output_dict_name month_of_data_to_keep") | ||
print(" The unit_test_category can be 'junit', 'pyunit' or 'runit'.") | ||
print(" The ouput_dict_name is the filename that we will save a dictionary structure of the failed unit tests.") | ||
print(" The month_of_data_to_keep is an integer indicating how many months that we want to kee the data " | ||
"starting from now. Any data that is older than the value will be deleted.") | ||
|
||
''' | ||
This function is written to extract the console output that has already been stored | ||
in a text file in a remote place and saved it in a local directory that we have accessed | ||
to. We want to be able to read in the local text file and proces it. | ||
''' | ||
def get_console_out(url_string): | ||
""" | ||
Grab the console output from Jenkins and save the content into a temp file | ||
(g_temp_filename). From the saved text file, we can grab the names of | ||
failed tests. | ||
Parameters | ||
---------- | ||
url_string : str | ||
contains information on the jenkins job whose console output we are interested in. It is in the context | ||
of resource_url/job/job_name/build_id/testReport/ | ||
:return: none | ||
""" | ||
full_command = 'curl ' + '"'+ url_string +'"'+ ' --user '+'"admin:admin"'+' > ' + g_temp_filename | ||
subprocess.call(full_command,shell=True) | ||
|
||
|
||
def extract_failed_tests_info(): | ||
""" | ||
This method will scrape the console output for pyunit,runit and hadoop runs and grab the list of failed tests | ||
and their corresponding paths so that the test execution summary can be located later. | ||
:return: none | ||
""" | ||
global g_failed_testnames | ||
global g_failed_test_paths | ||
|
||
if os.path.isfile(g_temp_filename): | ||
console_file = open(g_temp_filename,'r') # open temp file that stored jenkins job console output | ||
try: | ||
for each_line in console_file: # go through each line of console output to extract build ID, data/time ... | ||
each_line.strip() | ||
print(each_line) | ||
if ("Test Result" in each_line) and ("failure" in each_line): # the next few lines will contain failed tests | ||
temp = each_line.split("testReport") | ||
if ("Test Result" in temp[1]) and ("failure" in temp[1]): # grab number of failed tests | ||
tempCount = int(temp[1].split("</a>")[1].split(" ")[0].split("(")[1]) | ||
|
||
if isinstance(tempCount, int) and tempCount > 0: # temp[1], temp[2],... should contain failed tests | ||
for findex in range(2,len(temp)): | ||
tempMess = temp[findex].split(">") | ||
g_failed_test_paths.append(tempMess[0].strip('"')) | ||
ftestname = tempMess[1].strip("</a") | ||
nameLen = len(ftestname) | ||
true_testname = ftestname[8:nameLen] if 'r_suite.' in ftestname else ftestname | ||
g_failed_testnames.append(true_testname) | ||
break # done. Only one spot contains failed test info. | ||
finally: | ||
console_file.close() | ||
|
||
def save_failed_tests_info(): | ||
""" | ||
Given the failed tests information in g_failed_testnames, add the new failed test to | ||
text file. In addition, it will update the dictionary that stores all failed test info as well. | ||
:return: None | ||
""" | ||
global g_failed_tests_info_dict | ||
if len(g_failed_testnames) > 0: # found failed test | ||
if os.path.isfile(g_failed_tests_dict) and os.path.getsize(g_failed_tests_dict) > 10: | ||
try: | ||
g_failed_tests_info_dict=json.load(open(g_failed_tests_dict, 'r')) | ||
except: | ||
init_failed_tests_dict() | ||
|
||
# with open(g_failed_tests_dict, 'rb') as dict_file: | ||
# g_failed_tests_info_dict = pickle.load(dict_file) | ||
else: # file not found, create new dict | ||
init_failed_tests_dict() | ||
|
||
with open(g_summary_text_filename, 'a') as failed_file: | ||
for index in range(len(g_failed_testnames)): | ||
|
||
testInfo = ','.join([g_timestring, g_job_name, str(g_build_id), g_git_hash, g_node_name, | ||
g_unit_test_type, g_failed_testnames[index]]) | ||
failed_file.write(testInfo) | ||
failed_file.write('\n') | ||
# update failed tests dictionary | ||
update_failed_test_info_dict(g_failed_testnames[index], g_failed_test_paths[index]) | ||
json.dump(g_failed_tests_info_dict, open(g_failed_tests_dict, 'w')) | ||
# with open(g_failed_tests_dict, 'wb') as error_file: | ||
# pickle.dump(g_failed_tests_info_dict, error_file) | ||
|
||
|
||
def update_failed_test_info_dict(failed_testname, failed_test_path): | ||
""" | ||
Update the dictionary structure that stores failed unit test information. | ||
:param failed_testname: string containing name of failed test. | ||
:param failed_test_path: string containing the path to failed test url. | ||
:return: None | ||
""" | ||
global g_failed_tests_info_dict | ||
|
||
if failed_testname in g_failed_tests_info_dict["TestName"]: # existing test | ||
g_failed_tests_info_dict["TestInfo"][g_failed_tests_info_dict["TestName"].index(failed_testname)] = \ | ||
init_update_each_failed_test_dict( | ||
g_failed_tests_info_dict["TestInfo"][g_failed_tests_info_dict["TestName"].index(failed_testname)], | ||
failed_test_path, failed_testname, False) | ||
else: # next test | ||
g_failed_tests_info_dict["TestName"].append(failed_testname) | ||
g_failed_tests_info_dict["TestInfo"].append(init_update_each_failed_test_dict(dict(), failed_test_path, | ||
failed_testname, True)) | ||
|
||
def trim_data_back_to(monthToKeep): | ||
""" | ||
This method will remove data from the summary text file and the dictionary file for tests that occurs before | ||
the number of months specified by monthToKeep. | ||
:param monthToKeep: | ||
:return: | ||
""" | ||
global g_failed_tests_info_dict | ||
current_time = time.time() # unit in seconds | ||
|
||
oldest_time_allowed = current_time - monthToKeep*30*24*3600 # in seconds | ||
|
||
clean_up_failed_test_dict(oldest_time_allowed) | ||
clean_up_summary_text(oldest_time_allowed) | ||
|
||
|
||
def clean_up_failed_test_dict(oldest_time_allowed): | ||
# read in data from dictionary file | ||
global g_failed_tests_info_dict | ||
if os.path.isfile(g_failed_tests_dict) and os.path.getsize(g_failed_tests_dict)>10: | ||
try: | ||
g_failed_tests_info_dict=json.load(open(g_failed_tests_dict,'r')) | ||
test_index = 0 | ||
while test_index < len(g_failed_tests_info_dict["TestName"]): | ||
test_dicts = g_failed_tests_info_dict["TestInfo"][test_index] # a list of dictionary | ||
|
||
dict_index = 0 | ||
while (len(test_dicts["Timestamp"]) > 0) and (dict_index < len(test_dicts["Timestamp"])): | ||
if (test_dicts["Timestamp"][dict_index] < oldest_time_allowed): | ||
del test_dicts["JenkinsJobName"][dict_index] | ||
del test_dicts["BuildID"][dict_index] | ||
del test_dicts["Timestamp"][dict_index] | ||
del test_dicts["GitHash"][dict_index] | ||
del test_dicts["TestCategory"][dict_index] | ||
del test_dicts["NodeName"][dict_index] | ||
test_dicts["FailureCount"] -= 1 | ||
else: | ||
dict_index = dict_index+1 | ||
|
||
if test_dicts["FailureCount"] <= 0: # remove test name with 0 counts of failure count | ||
del g_failed_tests_info_dict["Testname"][test_index] | ||
del g_failed_tests_info_dict["TestInfo"][test_index] | ||
else: | ||
test_index = test_index+1 | ||
|
||
json.dump(g_failed_tests_info_dict, open(g_failed_tests_dict, 'w')) | ||
except: | ||
pass | ||
|
||
|
||
def clean_up_summary_text(oldest_time_allowed): | ||
# clean up the summary text data | ||
if os.path.isfile(g_summary_text_filename): | ||
with open(g_summary_text_filename, 'r') as text_file: | ||
with open(g_temp_filename, 'w') as temp_file: | ||
for each_line in text_file: | ||
temp = each_line.split(',') | ||
if len(temp) >= 7: | ||
dateObj = parser.parse(temp[0]).timetuple() | ||
timestamp = time.mktime(dateObj) | ||
|
||
if (timestamp > oldest_time_allowed): | ||
temp_file.write(each_line) | ||
|
||
with open(g_summary_text_filename, 'w') as text_file: # write content back to original summary file | ||
with open(g_temp_filename, 'r') as temp_file: | ||
text_file.write(temp_file.read()) | ||
|
||
|
||
def main(argv): | ||
""" | ||
Main program. Expect script name plus 7 inputs in the following order: | ||
- This script name | ||
1. timestamp: time in s | ||
2. jenkins_job_name (JOB_NAME) | ||
3. build_id (BUILD_ID) | ||
4. git hash (GIT_COMMIT) | ||
5. node name (NODE_NAME) | ||
6. unit test category (JUnit, PyUnit, RUnit, Hadoop) | ||
7. Jenkins URL (JENKINS_URL) | ||
8. Text file name where failure summaries are stored | ||
9. Filename that stored all failed test info as a dictionary | ||
10. duration (month) to keep data: data older tghan this input will be removed | ||
@return: none | ||
""" | ||
global g_script_name | ||
global g_test_root_dir | ||
global g_timestamp | ||
global g_job_name | ||
global g_build_id | ||
global g_git_hash | ||
global g_node_name | ||
global g_unit_test_type | ||
global g_jenkins_url | ||
global g_temp_filename | ||
global g_summary_text_filename # store failed test info in csv format | ||
global g_failed_tests_dict # store failed test info as a dictionary | ||
global g_resource_url | ||
global g_timestring | ||
|
||
if len(argv) < 11: | ||
print "Wrong call. Not enough arguments.\n" | ||
usage() | ||
sys.exit(1) | ||
else: # we may be in business | ||
g_script_name = os.path.basename(argv[0]) # get name of script being run. | ||
g_timestamp = float(argv[1]) | ||
g_job_name = argv[2] | ||
g_build_id = argv[3] | ||
g_git_hash = argv[4] | ||
g_node_name= argv[5] | ||
g_unit_test_type = argv[6] | ||
g_jenkins_url = argv[7] | ||
|
||
localtz = time.tzname[0] | ||
dt = parser.parse(time.ctime(g_timestamp)+ ' '+localtz) | ||
g_timestring = dt.strftime("%a %b %d %H:%M:%S %Y %Z") | ||
g_temp_filename = os.path.join(g_test_root_dir,'tempText') | ||
g_summary_text_filename = os.path.join(g_test_root_dir, argv[8]) | ||
g_failed_tests_dict = os.path.join(g_test_root_dir, argv[9]) | ||
monthToKeep = float(argv[10]) | ||
|
||
g_resource_url = '/'.join([g_jenkins_url, "job", g_job_name, g_build_id]) | ||
get_console_out(g_resource_url+"/#showFailuresLink/") # save remote console output in local directory | ||
extract_failed_tests_info() # grab the console text and stored the failed tests/paths | ||
save_failed_tests_info() # save new failed test info into a file | ||
if monthToKeep > 0: | ||
trim_data_back_to(monthToKeep) # remove data that are too old to save space | ||
|
||
if __name__ == "__main__": | ||
main(sys.argv) |
Oops, something went wrong.