Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

[Rest API] Add task state; Add job's retry details; Refine job config #2306

Merged
merged 10 commits into from
Mar 29, 2019
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions docs/rest-server/API.md
Original file line number Diff line number Diff line change
Expand Up @@ -440,10 +440,16 @@ Status: 200
createdTime: "createdTimestamp",
completedTime: "completedTimestamp",
executionType: "executionType",
// Sum of succeededRetriedCount, transientNormalRetriedCount,
// transientConflictRetriedCount, nonTransientRetriedCount,
// and unKnownRetriedCount
retries: retriedCount,
// sum of retries
retries: retries,
retryDetails: {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

better name to be retryBreakdown?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think "details" is better...

// Job failed due to user or unknown error
sunqinzheng marked this conversation as resolved.
Show resolved Hide resolved
user: userRetries,
// Job failed due to system/platform error
system: systemRetries,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"platform" is better than "system" due to:

  1. More accurate, user job may also be a "system"
  2. Our PAI is "platform" for AI, not "system" for AI
  3. There are PaaS, but no "system" as a Service

so can you replace all "system" words with "platform" in this PR?

Copy link
Contributor Author

@sunqinzheng sunqinzheng Mar 25, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find that "system" is used in rest api document (https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md#post-token) multiple times. Shall we replace them as well?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems there is no need to distinguish it from "user" in that doc, so there is no ambiguity there (the "system" in the doc implies PAI).
So it would be better to replace "in the system" with "in PAI", but it is not necessary; it is up to you :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

// Job cannot get required resource to run within timeout
resource: resourceRetries,
},
appId: "applicationId",
appProgress: "applicationProgress",
appTrackingUrl: "applicationTrackingUrl",
Expand All @@ -461,6 +467,7 @@ Status: 200
},
taskStatuses: {
taskIndex: taskIndex,
taskState: taskState,
containerId: "containerId",
containerIp: "containerIp",
containerPorts: {
Expand Down
7 changes: 5 additions & 2 deletions src/rest-server/src/controllers/job.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

// module dependencies
const yaml = require('js-yaml');
const Job = require('../models/job');
const createError = require('../util/error');
const logger = require('../config/logger');
Expand Down Expand Up @@ -149,8 +150,10 @@ const getConfig = (req, res, next) => {
req.job.name,
(error, result) => {
if (!error) {
// result may be JSON or YAML, depending on the job type the user submitted.
return typeof(result) == 'string' ? res.status(200).json(result) : res.status(200).send(result).type('yaml');
const data = yaml.safeLoad(result);
const type = req.accepts(['json', 'yaml']) || 'json';
const body = type === 'json' ? JSON.stringify(data) : yaml.safeDump(data);
return res.status(200).type(type).send(body);
} else if (error.message.startsWith('[WebHDFS] 404')) {
return next(createError('Not Found', 'NoJobConfigError', `Config of job ${req.job.name} is not found.`));
} else {
Expand Down
73 changes: 54 additions & 19 deletions src/rest-server/src/models/job.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ class Job {
jobState = 'RUNNING';
break;
case 'FRAMEWORK_COMPLETED':
if (typeof exitCode !== 'undefined' && parseInt(exitCode) === 0) {
if (exitCode === 0) {
jobState = 'SUCCEEDED';
} else if (typeof exitCode !== 'undefined' && parseInt(exitCode) == 214) {
} else if (exitCode === 214) {
jobState = 'STOPPED';
} else {
jobState = 'FAILED';
Expand All @@ -73,6 +73,26 @@ class Job {
return jobState;
}

convertTaskState(taskState, exitCode) {
switch (taskState) {
case 'TASK_WAITING':
case 'CONTAINER_REQUESTED':
case 'CONTAINER_ALLOCATED':
case 'CONTAINER_COMPLETED':
return 'WAITING';
case 'CONTAINER_RUNNING':
return 'RUNNING';
case 'TASK_COMPLETED':
if (exitCode === 0) {
return 'SUCCEEDED';
} else {
return 'FAILED';
}
default:
return 'UNKNOWN';
}
}

getJobList(query, namespace, next) {
let reqPath = launcherConfig.frameworksPath();
if (namespace) {
Expand All @@ -90,18 +110,30 @@ class Job {
return next(null, createError(res.status, 'UnknownError', res.raw_body));
}
let jobList = resJson.summarizedFrameworkInfos.map((frameworkInfo) => {
let retries = 0;
['succeededRetriedCount', 'transientNormalRetriedCount', 'transientConflictRetriedCount',
'nonTransientRetriedCount', 'unKnownRetriedCount'].forEach((retry) => {
retries += frameworkInfo.frameworkRetryPolicyState[retry];
});
// 1. transientNormalRetriedCount
// Failed, but it is guaranteed to succeed within a finite number of retries:
// such as dependent components shutdown, machine error, network error,
// configuration error, environment error...
// 2. transientConflictRetriedCount
// A special TRANSIENT_NORMAL which indicates that the exit was due to resource conflict
// and cannot get required resource to run.
// 3. unKnownRetriedCount
// Usually caused by user's code.
const systemRetries = frameworkInfo.frameworkRetryPolicyState.transientNormalRetriedCount;
const resourceRetries = frameworkInfo.frameworkRetryPolicyState.transientConflictRetriedCount;
const userRetries = frameworkInfo.frameworkRetryPolicyState.unKnownRetriedCount;
const job = {
name: frameworkInfo.frameworkName,
username: frameworkInfo.userName,
state: this.convertJobState(frameworkInfo.frameworkState, frameworkInfo.applicationExitCode),
subState: frameworkInfo.frameworkState,
executionType: frameworkInfo.executionType,
retries: retries,
retries: systemRetries + resourceRetries + userRetries,
retryDetails: {
user: userRetries,
system: systemRetries,
resource: resourceRetries,
},
createdTime: frameworkInfo.firstRequestTimestamp || new Date(2018, 1, 1).getTime(),
completedTime: frameworkInfo.frameworkCompletedTimestamp,
appExitCode: frameworkInfo.applicationExitCode,
Expand Down Expand Up @@ -252,7 +284,7 @@ class Job {
null,
(error, result) => {
if (!error) {
next(null, JSON.stringify(JSON.parse(result.content), null, 2));
next(null, result.content);
} else {
next(error);
}
Expand Down Expand Up @@ -333,22 +365,24 @@ class Job {
if (frameworkStatus) {
const jobState = this.convertJobState(
frameworkStatus.frameworkState,
frameworkStatus.applicationExitCode);
let jobRetryCount = 0;
const jobRetryCountInfo = frameworkStatus.frameworkRetryPolicyState;
jobRetryCount =
jobRetryCountInfo.succeededRetriedCount +
jobRetryCountInfo.transientNormalRetriedCount +
jobRetryCountInfo.transientConflictRetriedCount +
jobRetryCountInfo.nonTransientRetriedCount +
jobRetryCountInfo.unKnownRetriedCount;
frameworkStatus.applicationExitCode,
);

const systemRetries = frameworkStatus.frameworkRetryPolicyState.transientNormalRetriedCount;
const resourceRetries = frameworkStatus.frameworkRetryPolicyState.transientConflictRetriedCount;
const userRetries = frameworkStatus.frameworkRetryPolicyState.unKnownRetriedCount;
jobDetail.jobStatus = {
name: framework.name,
username: 'unknown',
state: jobState,
subState: frameworkStatus.frameworkState,
executionType: framework.summarizedFrameworkInfo.executionType,
retries: jobRetryCount,
retries: systemRetries + resourceRetries + userRetries,
retryDetails: {
user: userRetries,
system: systemRetries,
resource: resourceRetries,
},
createdTime: frameworkStatus.frameworkCreatedTimestamp,
completedTime: frameworkStatus.frameworkCompletedTimestamp,
appId: frameworkStatus.applicationId,
Expand Down Expand Up @@ -388,6 +422,7 @@ class Job {
}
jobDetail.taskRoles[taskRole].taskStatuses.push({
taskIndex: task.taskIndex,
taskState: this.convertTaskState(task.taskState, task.containerExitCode),
containerId: task.containerId,
containerIp: task.containerIp,
containerPorts,
Expand Down