forked from mirrors/gecko-dev
		
	 9472cb160b
			
		
	
	
		9472cb160b
		
	
	
	
	
		
			
			MozReview-Commit-ID: Hu9ju4XVQpA --HG-- extra : rebase_source : dba8ba385a86ef24a031bca58f01c6ec553d1459
		
			
				
	
	
		
			197 lines
		
	
	
	
		
			8.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			197 lines
		
	
	
	
		
			8.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # This Source Code Form is subject to the terms of the Mozilla Public
 | |
| # License, v. 2.0. If a copy of the MPL was not distributed with this
 | |
| # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 | |
| 
 | |
| from __future__ import absolute_import, print_function, unicode_literals
 | |
| 
 | |
| import json
 | |
| import logging
 | |
| import requests
 | |
| from collections import defaultdict
 | |
| from redo import retry
 | |
| from requests import exceptions
 | |
| 
 | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Names of the projects on which SETA is useful.
SETA_PROJECTS = [
    'mozilla-inbound',
    'autoland',
]

# Per project: every Nth push (by pushlog id) no task is treated as low value.
PROJECT_SCHEDULE_ALL_EVERY_PUSHES = {
    'mozilla-inbound': 5,
    'autoland': 5,
}

# Per project: once this many minutes have elapsed since the previous push,
# no task is treated as low value.
PROJECT_SCHEDULE_ALL_EVERY_MINUTES = {
    'mozilla-inbound': 60,
    'autoland': 60,
}

# URL template for the SETA job priorities; filled with (project, build system type).
SETA_ENDPOINT = (
    "https://treeherder.mozilla.org/api/project/%s/seta/"
    "job-priorities/?build_system_type=%s"
)

# URL template for the json-pushes query; filled with (project, startID, endID).
PUSH_ENDPOINT = "https://hg.mozilla.org/integration/%s/json-pushes/?startID=%d&endID=%d"
 | |
| 
 | |
| 
 | |
class SETA(object):
    """
    Interface to the SETA service, which defines low-value tasks that can be optimized out
    of the taskgraph.

    An instance keeps in-memory caches (low value task lists by project, push dates by
    project, and the push ids whose date lookup failed) so that repeated calls do not
    repeat the same HTTP requests.
    """

    def __init__(self):
        # cached low value tasks, by project
        self.low_value_tasks = {}
        # not read by any method of this class; kept so the attribute stays available
        self.low_value_bb_tasks = {}
        # cached push dates by project
        self.push_dates = defaultdict(dict)
        # cached push_ids that failed to retrieve datetime for
        self.failed_json_push_calls = []

    def _get_task_string(self, task_tuple):
        """
        Convert a sequence of job-type parts into a single task label string.

        Empty parts are dropped first.  Exactly three remaining parts are rendered as
        'test-<part0>/<part1>-<part2>'; any other number of parts is joined with single
        spaces, which yields '' when nothing remains.
        """
        # truthiness (rather than len()) also drops None parts instead of raising
        parts = [part for part in task_tuple if part]

        if len(parts) == 3:
            return 'test-%s/%s-%s' % tuple(parts)
        return ' '.join(parts)

    def query_low_value_tasks(self, project):
        """
        Request the list of low value tasks for `project` from the SETA service.

        Low value tasks will be optimized out of the task graph.  The response is expected
        to be a JSON object whose 'jobtypes' member is a dict; the first value of that
        dict is used as the task list.  Entries that are themselves lists are converted
        to label strings with `_get_task_string`, and any label containing 'build'
        (case-insensitive) is dropped.

        Returns a list of task labels.  The list is empty when the request fails, the
        response is not valid JSON, or the payload does not have the expected shape, so
        that every task is then treated as high value.
        """
        low_value_tasks = []

        # we want to get low priority taskcluster jobs
        url = SETA_ENDPOINT % (project, 'taskcluster')

        # Try to fetch the SETA data twice (attempts=2, sleeptime=10), falling back to an
        # empty list of low value tasks.
        try:
            logger.debug("Retrieving low-value jobs list from SETA")
            # NOTE(review): 'headers' is an empty string here while the json-pushes
            # request sends a User-Agent -- confirm this is intentional.
            response = retry(requests.get, attempts=2, sleeptime=10,
                             args=(url, ),
                             kwargs={'timeout': 60, 'headers': ''})
            payload = json.loads(response.content)
            # a payload that is not a JSON object has no 'jobtypes' to look at
            task_list = payload.get('jobtypes', '') if isinstance(payload, dict) else ''

            if isinstance(task_list, dict) and task_list:
                # dict views cannot be indexed on Python 3, so take the first value
                # through an iterator; this works on Python 2 as well
                first_value = next(iter(task_list.values()))
                if isinstance(first_value, list) and first_value:
                    low_value_tasks = first_value
                    # bb job types return a list instead of a single string,
                    # convert to a single string to match tc tasks format
                    if isinstance(low_value_tasks[0], list):
                        low_value_tasks = [self._get_task_string(x) for x in low_value_tasks]

            # ensure no build tasks slipped in, we never want to optimize out those
            low_value_tasks = [x for x in low_value_tasks if 'build' not in x.lower()]

        # The request timed out.
        except exceptions.Timeout:
            logger.warning("SETA timeout, we will treat all test tasks as high value.")

        # Network problem (e.g. DNS failure, refused connection, etc).
        except exceptions.ConnectionError:
            logger.warning("SETA connection error, we will treat all test tasks as high value.")

        # Invalid HTTP response.
        # NOTE(review): raise_for_status() is never called on the response, so 4xx/5xx
        # replies presumably reach the JSON handling above instead -- confirm.
        except exceptions.HTTPError:
            logger.warning("We got bad Http response from ouija,"
                           " we will treat all test tasks as high value.")

        # Any other requests failure not handled above is logged as a warning.
        except exceptions.RequestException as error:
            logger.warning(error)

        # When we get invalid JSON (i.e. 500 error), it results in a ValueError (bug 1313426)
        except ValueError as error:
            logger.warning("Invalid JSON, possible server error: %s", error)

        return low_value_tasks

    def _fetch_push_date(self, project, push_id):
        """
        Retrieve the date of push `push_id` on `project` from the json-pushes endpoint.

        Returns the 'date' value reported for the push, or 0 when the request fails, the
        response is not valid JSON, or the response carries no positive date for the push.
        """
        url = PUSH_ENDPOINT % (project, push_id - 1, push_id)

        try:
            response = retry(requests.get, attempts=2, sleeptime=10,
                             args=(url, ),
                             kwargs={'timeout': 60, 'headers': {'User-Agent': 'TaskCluster'}})
            pushes = json.loads(response.content)

        # The request timed out.
        except exceptions.Timeout:
            logger.warning("json-pushes timeout, treating task as high value")
            return 0

        # Network problem (e.g. DNS failure, refused connection, etc).
        except exceptions.ConnectionError:
            logger.warning("json-pushes connection error, treating task as high value")
            return 0

        # Invalid HTTP response.
        except exceptions.HTTPError:
            logger.warning("Bad Http response, treating task as high value")
            return 0

        # When we get invalid JSON (i.e. 500 error), it results in a ValueError (bug 1313426)
        except ValueError as error:
            logger.warning("Invalid JSON, possible server error: %s", error)
            return 0

        # Any other requests failure not handled above is logged as a warning.
        except exceptions.RequestException as error:
            logger.warning(error)
            return 0

        # tolerate payloads that are not shaped like {"<push id>": {"date": ...}}
        push_info = pushes.get(str(push_id), {}) if isinstance(pushes, dict) else {}
        push_date = push_info.get('date', 0) if isinstance(push_info, dict) else 0
        if not push_date or push_date <= 0:
            logger.warning("json-pushes returned no date for push %s,"
                           " treating task as high value", push_id)
            return 0
        return push_date

    def minutes_between_pushes(self, project, cur_push_id, cur_push_date):
        """
        Return the minutes elapsed between the current push and the previous one.

        The current push date is cached under `cur_push_id`.  The date of the previous
        push (`cur_push_id - 1`) is taken from the cache or, when missing, requested from
        json-pushes once; a push id whose lookup failed is remembered in
        `failed_json_push_calls` and not requested again.

        Dates are assumed to be in seconds, since their difference is divided by 60.
        When either date is unavailable, the project's PROJECT_SCHEDULE_ALL_EVERY_MINUTES
        value (60 if the project is not listed) is returned, so the task ends up being run.
        """
        prev_push_id = cur_push_id - 1
        project_dates = self.push_dates[project]

        # cache the pushdate for the current push so we can use it next time
        project_dates[cur_push_id] = cur_push_date

        # check if we already have the previous push id's datetime cached
        prev_push_date = project_dates.get(prev_push_id, 0)

        # datetime for previous pushid not cached, so must retrieve it; if we already
        # tried to retrieve the datetime for this pushid before and the json-push
        # request failed, don't try it again
        if not prev_push_date > 0 and prev_push_id not in self.failed_json_push_calls:
            prev_push_date = self._fetch_push_date(project, prev_push_id)
            if prev_push_date > 0:
                # cache it for next time
                project_dates[prev_push_id] = prev_push_date
            else:
                # remember the failure: this method is called once per task label, and
                # repeating a failing request for each of them gains nothing
                self.failed_json_push_calls.append(prev_push_id)

        # we have datetime of current and previous push, so return elapsed minutes
        if cur_push_date > 0 and prev_push_date > 0:
            return (cur_push_date - prev_push_date) / 60

        # defaulting to max min so if we can't get value, defaults to run the task
        return PROJECT_SCHEDULE_ALL_EVERY_MINUTES.get(project, 60)

    def is_low_value_task(self, label, project, pushlog_id, push_date):
        """
        Tell whether the task `label` is low value for the given push.

        Marking a task as low value means it will be optimized out by tc.  The answer is
        False when `project` is not in SETA_PROJECTS, on every Nth push of the project
        (PROJECT_SCHEDULE_ALL_EVERY_PUSHES), and when at least
        PROJECT_SCHEDULE_ALL_EVERY_MINUTES minutes passed since the previous push.
        Otherwise it is True exactly when `label` is in the project's low value task
        list, which is queried from SETA once and then cached.

        `pushlog_id` and `push_date` are converted with int() before use.
        """
        if project not in SETA_PROJECTS:
            return False

        schedule_all_every = PROJECT_SCHEDULE_ALL_EVERY_PUSHES.get(project, 5)
        # on every Nth push, want to run all tasks
        if int(pushlog_id) % schedule_all_every == 0:
            return False

        # not an Nth push, so SETA would normally be consulted; however we also want to
        # ensure we run all tasks at least once per N minutes
        elapsed_minutes = self.minutes_between_pushes(project, int(pushlog_id), int(push_date))
        if elapsed_minutes >= PROJECT_SCHEDULE_ALL_EVERY_MINUTES.get(project, 60):
            return False

        # cache the low value tasks per project to avoid repeated SETA server queries
        if project not in self.low_value_tasks:
            self.low_value_tasks[project] = self.query_low_value_tasks(project)
        return label in self.low_value_tasks[project]
 | |
| 
 | |
| 
 | |
# Build the one SETA instance shared by this module, then publish its bound
# `is_low_value_task` method under a module-level name so that it can be
# called like a plain function.
_seta = SETA()
is_low_value_task = _seta.is_low_value_task
 |