Skip to content

Commit

Permalink
PUBDEV_4110-find-intermittents: (h2oai#972)
Browse files Browse the repository at this point in the history
- changed run.py to show failed test name correctly;
	- added scrapeForIntermittents.py to gather failed tests information if a Jenkins job failed.
	- added summarizeIntermittents.py to gather all failed tests and collect tests info that failed too often as intermittents.

PUBDEV_4110-find-intermittents: changed time from timestamp to string time.  Printed out most recent failure for intermittents per Michal code review suggestion.

PUBDEV_4110-find-intermittents: Add timezone info.
  • Loading branch information
wendycwong authored Mar 29, 2017
1 parent c47a938 commit 1b78171
Show file tree
Hide file tree
Showing 3 changed files with 570 additions and 3 deletions.
7 changes: 4 additions & 3 deletions scripts/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -1814,7 +1814,7 @@ def _report_xunit_result(self, testsuite_name, testcase_name, testcase_runtime,
failure_message += "\n\n"
failure_message += "#" * 83 + "\n"
failure_message += "########### Problems encountered extracting Java messages. " \
"Please alert the QA team.\n"
"Massive Jenkins or test failure.\n"
failure_message += "#" * 83 + "\n\n"

if failure_message:
Expand All @@ -1824,13 +1824,14 @@ def _report_xunit_result(self, testsuite_name, testcase_name, testcase_runtime,
else:
failure = ""

# fixed problem with test name repeated in Jenkins job test report.
xml_report = """<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="{testsuiteName}" tests="1" errors="{errors}" failures="{failures}" skip="{skip}">
<testcase classname="{testcaseClassName}" name="{testcaseName}" time="{testcaseRuntime}">
<testcase name="{testcaseName}" time="{testcaseRuntime}">
{failure}
</testcase>
</testsuite>
""".format(testsuiteName=testsuite_name, testcaseClassName=testcase_name, testcaseName=testcase_name,
""".format(testsuiteName=testsuite_name, testcaseName=testcase_name,
testcaseRuntime=testcase_runtime, failure=failure,
errors=errors, failures=failures, skip=skip)

Expand Down
358 changes: 358 additions & 0 deletions scripts/scrapeForIntermittents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,358 @@
#!/usr/bin/python

import sys
import os
import json
import subprocess
import time
import datetime
from pytz import timezone
from dateutil import parser

"""
This script will be invoked if it is included in the post-build action of an jenkin job and the job has failed.
It will perform the following tasks:
1. attach the failure informaiton of all tests in the current build to a summary file including the following fields:
- timestamp, jenkin_job_name, build_id, git_has, node_name, build_failure(test failed but to build failure),
JUnit/PyUnit/RUnit/Hadoop, testName.
2. save the above summary file to s3 somewhere using the command: s3cmd put "$TEST_OUTPUT_FILE" s3://ai.h2o.tests/jenkins/
3. store the failed test info in a dictionary and save it to s3 as well;
4. for failed tests, save the txt failed test results to aid the debugging process. Attach timestamp to file name
in order to aid the process of cleaning out the file directory with obsolete files.
"""

# --------------------------------------------------------------------
# Main program
# --------------------------------------------------------------------

g_test_root_dir = os.path.dirname(os.path.realpath(__file__)) # directory where we are running out code from
g_script_name = '' # store script name.
g_timestamp = ''
g_job_name = ''
g_build_id = ''
g_git_hash = ''
g_node_name = ''
g_unit_test_type = ''
g_jenkins_url = ''
g_temp_filename = os.path.join(g_test_root_dir,'tempText') # temp file to store data curled from Jenkins
g_failed_testnames = []
g_failed_test_paths = []
g_failed_tests_dict = '' # contains the filename that contains failed tests info in a dictionary
g_failed_tests_info_dict = dict() # store failed tests info in a dictionary
g_resource_url = ''
g_timestring = ''

def init_failed_tests_dict():
"""
initialize the fields of dictionary storing failed tests.
:return:
"""
global g_failed_test_info_dict
g_failed_tests_info_dict["TestName"] = []
g_failed_tests_info_dict["TestInfo"] = []


def init_update_each_failed_test_dict(one_test_info, failed_test_path, testName, newTest):
"""
For each test, a dictionary structure will be built to record the various info about that test's failure
information. In particular, for each failed tests, there will be a dictionary associated with that test
stored in the field "TestInfo" of g_faiiled_tests_info_dict. The following fields are included:
"JenkinsJobName": job name
"BuildID"
"Timestamp": in seconds
"GitHash"
"TestCategory": JUnit, PyUnit, RUnit or HadoopPyUnit, HadoopRUnit
"NodeName": name of machine that the job was run on
"FailureCount": integer counting number of times this particular test has failed. An intermittent can be
determined as any test with FailureCount >= 2.
"FailureMessages": contains failure messages for the test
:return: a new dict for that test
"""
if newTest:
one_test_info = dict()
one_test_info["JenkinsJobName"] = []
one_test_info["BuildID"] = []
one_test_info["Timestamp"] = []
one_test_info["GitHash"] = []
one_test_info["TestCategory"] = [] # would be JUnit, PyUnit, RUnit or HadoopPyUnit, HadoopRUnit
one_test_info["NodeName"] = []
one_test_info["FailureMessages"] = [] # contains failure messages for the test
one_test_info["FailureCount"] = 0
one_test_info["TestName"] = testName

# if g_timestamp not in one_test_info["Timestamp"]:
one_test_info["JenkinsJobName"].append(g_job_name)
one_test_info["BuildID"].append(g_build_id)
one_test_info["Timestamp"].append(g_timestamp)
one_test_info["GitHash"].append(g_git_hash)
one_test_info["TestCategory"].append(g_unit_test_type) # would be JUnit, PyUnit, RUnit or HadoopPyUnit, HadoopRUnit
one_test_info["NodeName"].append(g_node_name)
one_test_info["FailureCount"] += 1

error_url = '/'.join([g_resource_url, 'testReport', failed_test_path])
get_console_out(error_url) # store failure message in temp file

if os.path.isfile(g_temp_filename):
with open(g_temp_filename, 'r') as error_file:
one_test_info["FailureMessages"].append(error_file.read())
else:
one_test_info["FailureMessages"].append("") # append empty error message if file not found
return one_test_info

def usage():
"""
Print USAGE help.
"""
print("")
print("Usage: ")
print("python scrapeForIntermittents timestamp job_name build_id git_sha node_name unit_test_category jenkins_URL "
"output_filename output_dict_name month_of_data_to_keep")
print(" The unit_test_category can be 'junit', 'pyunit' or 'runit'.")
print(" The ouput_dict_name is the filename that we will save a dictionary structure of the failed unit tests.")
print(" The month_of_data_to_keep is an integer indicating how many months that we want to kee the data "
"starting from now. Any data that is older than the value will be deleted.")

'''
This function is written to extract the console output that has already been stored
in a text file in a remote place and saved it in a local directory that we have accessed
to. We want to be able to read in the local text file and proces it.
'''
def get_console_out(url_string):
"""
Grab the console output from Jenkins and save the content into a temp file
(g_temp_filename). From the saved text file, we can grab the names of
failed tests.
Parameters
----------
url_string : str
contains information on the jenkins job whose console output we are interested in. It is in the context
of resource_url/job/job_name/build_id/testReport/
:return: none
"""
full_command = 'curl ' + '"'+ url_string +'"'+ ' --user '+'"admin:admin"'+' > ' + g_temp_filename
subprocess.call(full_command,shell=True)


def extract_failed_tests_info():
"""
This method will scrape the console output for pyunit,runit and hadoop runs and grab the list of failed tests
and their corresponding paths so that the test execution summary can be located later.
:return: none
"""
global g_failed_testnames
global g_failed_test_paths

if os.path.isfile(g_temp_filename):
console_file = open(g_temp_filename,'r') # open temp file that stored jenkins job console output
try:
for each_line in console_file: # go through each line of console output to extract build ID, data/time ...
each_line.strip()
print(each_line)
if ("Test Result" in each_line) and ("failure" in each_line): # the next few lines will contain failed tests
temp = each_line.split("testReport")
if ("Test Result" in temp[1]) and ("failure" in temp[1]): # grab number of failed tests
tempCount = int(temp[1].split("</a>")[1].split(" ")[0].split("(")[1])

if isinstance(tempCount, int) and tempCount > 0: # temp[1], temp[2],... should contain failed tests
for findex in range(2,len(temp)):
tempMess = temp[findex].split(">")
g_failed_test_paths.append(tempMess[0].strip('"'))
ftestname = tempMess[1].strip("</a")
nameLen = len(ftestname)
true_testname = ftestname[8:nameLen] if 'r_suite.' in ftestname else ftestname
g_failed_testnames.append(true_testname)
break # done. Only one spot contains failed test info.
finally:
console_file.close()

def save_failed_tests_info():
"""
Given the failed tests information in g_failed_testnames, add the new failed test to
text file. In addition, it will update the dictionary that stores all failed test info as well.
:return: None
"""
global g_failed_tests_info_dict
if len(g_failed_testnames) > 0: # found failed test
if os.path.isfile(g_failed_tests_dict) and os.path.getsize(g_failed_tests_dict) > 10:
try:
g_failed_tests_info_dict=json.load(open(g_failed_tests_dict, 'r'))
except:
init_failed_tests_dict()

# with open(g_failed_tests_dict, 'rb') as dict_file:
# g_failed_tests_info_dict = pickle.load(dict_file)
else: # file not found, create new dict
init_failed_tests_dict()

with open(g_summary_text_filename, 'a') as failed_file:
for index in range(len(g_failed_testnames)):

testInfo = ','.join([g_timestring, g_job_name, str(g_build_id), g_git_hash, g_node_name,
g_unit_test_type, g_failed_testnames[index]])
failed_file.write(testInfo)
failed_file.write('\n')
# update failed tests dictionary
update_failed_test_info_dict(g_failed_testnames[index], g_failed_test_paths[index])
json.dump(g_failed_tests_info_dict, open(g_failed_tests_dict, 'w'))
# with open(g_failed_tests_dict, 'wb') as error_file:
# pickle.dump(g_failed_tests_info_dict, error_file)


def update_failed_test_info_dict(failed_testname, failed_test_path):
"""
Update the dictionary structure that stores failed unit test information.
:param failed_testname: string containing name of failed test.
:param failed_test_path: string containing the path to failed test url.
:return: None
"""
global g_failed_tests_info_dict

if failed_testname in g_failed_tests_info_dict["TestName"]: # existing test
g_failed_tests_info_dict["TestInfo"][g_failed_tests_info_dict["TestName"].index(failed_testname)] = \
init_update_each_failed_test_dict(
g_failed_tests_info_dict["TestInfo"][g_failed_tests_info_dict["TestName"].index(failed_testname)],
failed_test_path, failed_testname, False)
else: # next test
g_failed_tests_info_dict["TestName"].append(failed_testname)
g_failed_tests_info_dict["TestInfo"].append(init_update_each_failed_test_dict(dict(), failed_test_path,
failed_testname, True))

def trim_data_back_to(monthToKeep):
"""
This method will remove data from the summary text file and the dictionary file for tests that occurs before
the number of months specified by monthToKeep.
:param monthToKeep:
:return:
"""
global g_failed_tests_info_dict
current_time = time.time() # unit in seconds

oldest_time_allowed = current_time - monthToKeep*30*24*3600 # in seconds

clean_up_failed_test_dict(oldest_time_allowed)
clean_up_summary_text(oldest_time_allowed)


def clean_up_failed_test_dict(oldest_time_allowed):
# read in data from dictionary file
global g_failed_tests_info_dict
if os.path.isfile(g_failed_tests_dict) and os.path.getsize(g_failed_tests_dict)>10:
try:
g_failed_tests_info_dict=json.load(open(g_failed_tests_dict,'r'))
test_index = 0
while test_index < len(g_failed_tests_info_dict["TestName"]):
test_dicts = g_failed_tests_info_dict["TestInfo"][test_index] # a list of dictionary

dict_index = 0
while (len(test_dicts["Timestamp"]) > 0) and (dict_index < len(test_dicts["Timestamp"])):
if (test_dicts["Timestamp"][dict_index] < oldest_time_allowed):
del test_dicts["JenkinsJobName"][dict_index]
del test_dicts["BuildID"][dict_index]
del test_dicts["Timestamp"][dict_index]
del test_dicts["GitHash"][dict_index]
del test_dicts["TestCategory"][dict_index]
del test_dicts["NodeName"][dict_index]
test_dicts["FailureCount"] -= 1
else:
dict_index = dict_index+1

if test_dicts["FailureCount"] <= 0: # remove test name with 0 counts of failure count
del g_failed_tests_info_dict["Testname"][test_index]
del g_failed_tests_info_dict["TestInfo"][test_index]
else:
test_index = test_index+1

json.dump(g_failed_tests_info_dict, open(g_failed_tests_dict, 'w'))
except:
pass


def clean_up_summary_text(oldest_time_allowed):
# clean up the summary text data
if os.path.isfile(g_summary_text_filename):
with open(g_summary_text_filename, 'r') as text_file:
with open(g_temp_filename, 'w') as temp_file:
for each_line in text_file:
temp = each_line.split(',')
if len(temp) >= 7:
dateObj = parser.parse(temp[0]).timetuple()
timestamp = time.mktime(dateObj)

if (timestamp > oldest_time_allowed):
temp_file.write(each_line)

with open(g_summary_text_filename, 'w') as text_file: # write content back to original summary file
with open(g_temp_filename, 'r') as temp_file:
text_file.write(temp_file.read())


def main(argv):
"""
Main program. Expect script name plus 7 inputs in the following order:
- This script name
1. timestamp: time in s
2. jenkins_job_name (JOB_NAME)
3. build_id (BUILD_ID)
4. git hash (GIT_COMMIT)
5. node name (NODE_NAME)
6. unit test category (JUnit, PyUnit, RUnit, Hadoop)
7. Jenkins URL (JENKINS_URL)
8. Text file name where failure summaries are stored
9. Filename that stored all failed test info as a dictionary
10. duration (month) to keep data: data older tghan this input will be removed
@return: none
"""
global g_script_name
global g_test_root_dir
global g_timestamp
global g_job_name
global g_build_id
global g_git_hash
global g_node_name
global g_unit_test_type
global g_jenkins_url
global g_temp_filename
global g_summary_text_filename # store failed test info in csv format
global g_failed_tests_dict # store failed test info as a dictionary
global g_resource_url
global g_timestring

if len(argv) < 11:
print "Wrong call. Not enough arguments.\n"
usage()
sys.exit(1)
else: # we may be in business
g_script_name = os.path.basename(argv[0]) # get name of script being run.
g_timestamp = float(argv[1])
g_job_name = argv[2]
g_build_id = argv[3]
g_git_hash = argv[4]
g_node_name= argv[5]
g_unit_test_type = argv[6]
g_jenkins_url = argv[7]

localtz = time.tzname[0]
dt = parser.parse(time.ctime(g_timestamp)+ ' '+localtz)
g_timestring = dt.strftime("%a %b %d %H:%M:%S %Y %Z")
g_temp_filename = os.path.join(g_test_root_dir,'tempText')
g_summary_text_filename = os.path.join(g_test_root_dir, argv[8])
g_failed_tests_dict = os.path.join(g_test_root_dir, argv[9])
monthToKeep = float(argv[10])

g_resource_url = '/'.join([g_jenkins_url, "job", g_job_name, g_build_id])
get_console_out(g_resource_url+"/#showFailuresLink/") # save remote console output in local directory
extract_failed_tests_info() # grab the console text and stored the failed tests/paths
save_failed_tests_info() # save new failed test info into a file
if monthToKeep > 0:
trim_data_back_to(monthToKeep) # remove data that are too old to save space

if __name__ == "__main__":
main(sys.argv)
Loading

0 comments on commit 1b78171

Please sign in to comment.