Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

[Rest API] Add task state; Add job's retry details; Refine job config #2306

Merged
merged 10 commits into from
Mar 29, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions docs/rest-server/API.md
Original file line number Diff line number Diff line change
Expand Up @@ -440,10 +440,16 @@ Status: 200
createdTime: "createdTimestamp",
completedTime: "completedTimestamp",
executionType: "executionType",
// Sum of succeededRetriedCount, transientNormalRetriedCount,
// transientConflictRetriedCount, nonTransientRetriedCount,
// and unKnownRetriedCount
retries: retriedCount,
// Total retry count: the sum of the user, platform and resource retries in retryDetails
retries: retries,
retryDetails: {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

better name to be retryBreakdown?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think "details" is better...

// Job failed due to user or unknown error
user: userRetries,
// Job failed due to platform error
platform: platformRetries,
// Job cannot get required resource to run within timeout
resource: resourceRetries,
},
appId: "applicationId",
appProgress: "applicationProgress",
appTrackingUrl: "applicationTrackingUrl",
Expand All @@ -461,6 +467,7 @@ Status: 200
},
taskStatuses: {
taskIndex: taskIndex,
taskState: taskState,
containerId: "containerId",
containerIp: "containerIp",
containerPorts: {
Expand Down
7 changes: 5 additions & 2 deletions src/rest-server/src/controllers/job.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

// module dependencies
const yaml = require('js-yaml');
const Job = require('../models/job');
const createError = require('../util/error');
const logger = require('../config/logger');
Expand Down Expand Up @@ -149,8 +150,10 @@ const getConfig = (req, res, next) => {
req.job.name,
(error, result) => {
if (!error) {
// result may be JSON or YAML, depending on the job type the user submitted.
return typeof(result) == 'string' ? res.status(200).json(result) : res.status(200).send(result).type('yaml');
const data = yaml.safeLoad(result);
const type = req.accepts(['json', 'yaml']) || 'json';
const body = type === 'json' ? JSON.stringify(data) : yaml.safeDump(data);
return res.status(200).type(type).send(body);
} else if (error.message.startsWith('[WebHDFS] 404')) {
return next(createError('Not Found', 'NoJobConfigError', `Config of job ${req.job.name} is not found.`));
} else {
Expand Down
73 changes: 54 additions & 19 deletions src/rest-server/src/models/job.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ class Job {
jobState = 'RUNNING';
break;
case 'FRAMEWORK_COMPLETED':
if (typeof exitCode !== 'undefined' && parseInt(exitCode) === 0) {
if (exitCode === 0) {
jobState = 'SUCCEEDED';
} else if (typeof exitCode !== 'undefined' && parseInt(exitCode) == 214) {
} else if (exitCode === 214) {
jobState = 'STOPPED';
} else {
jobState = 'FAILED';
Expand All @@ -73,6 +73,26 @@ class Job {
return jobState;
}

convertTaskState(taskState, exitCode) {
switch (taskState) {
case 'TASK_WAITING':
case 'CONTAINER_REQUESTED':
case 'CONTAINER_ALLOCATED':
case 'CONTAINER_COMPLETED':
return 'WAITING';
case 'CONTAINER_RUNNING':
return 'RUNNING';
case 'TASK_COMPLETED':
if (exitCode === 0) {
return 'SUCCEEDED';
} else {
return 'FAILED';
}
default:
return 'UNKNOWN';
}
}

getJobList(query, namespace, next) {
let reqPath = launcherConfig.frameworksPath();
if (namespace) {
Expand All @@ -90,18 +110,30 @@ class Job {
return next(null, createError(res.status, 'UnknownError', res.raw_body));
}
let jobList = resJson.summarizedFrameworkInfos.map((frameworkInfo) => {
let retries = 0;
['succeededRetriedCount', 'transientNormalRetriedCount', 'transientConflictRetriedCount',
'nonTransientRetriedCount', 'unKnownRetriedCount'].forEach((retry) => {
retries += frameworkInfo.frameworkRetryPolicyState[retry];
});
// 1. transientNormalRetriedCount
// The framework failed, but it is expected to succeed within a finite number of retries:
// such as dependent components shutdown, machine error, network error,
// configuration error, environment error...
// 2. transientConflictRetriedCount
// A special case of TRANSIENT_NORMAL which indicates that the exit was due to a resource conflict
// and the required resource to run could not be obtained.
// 3. unKnownRetriedCount
// Usually caused by the user's code.
const platformRetries = frameworkInfo.frameworkRetryPolicyState.transientNormalRetriedCount;
const resourceRetries = frameworkInfo.frameworkRetryPolicyState.transientConflictRetriedCount;
const userRetries = frameworkInfo.frameworkRetryPolicyState.unKnownRetriedCount;
const job = {
name: frameworkInfo.frameworkName,
username: frameworkInfo.userName,
state: this.convertJobState(frameworkInfo.frameworkState, frameworkInfo.applicationExitCode),
subState: frameworkInfo.frameworkState,
executionType: frameworkInfo.executionType,
retries: retries,
retries: platformRetries + resourceRetries + userRetries,
retryDetails: {
user: userRetries,
platform: platformRetries,
resource: resourceRetries,
},
createdTime: frameworkInfo.firstRequestTimestamp || new Date(2018, 1, 1).getTime(),
completedTime: frameworkInfo.frameworkCompletedTimestamp,
appExitCode: frameworkInfo.applicationExitCode,
Expand Down Expand Up @@ -252,7 +284,7 @@ class Job {
null,
(error, result) => {
if (!error) {
next(null, JSON.stringify(JSON.parse(result.content), null, 2));
next(null, result.content);
} else {
next(error);
}
Expand Down Expand Up @@ -333,22 +365,24 @@ class Job {
if (frameworkStatus) {
const jobState = this.convertJobState(
frameworkStatus.frameworkState,
frameworkStatus.applicationExitCode);
let jobRetryCount = 0;
const jobRetryCountInfo = frameworkStatus.frameworkRetryPolicyState;
jobRetryCount =
jobRetryCountInfo.succeededRetriedCount +
jobRetryCountInfo.transientNormalRetriedCount +
jobRetryCountInfo.transientConflictRetriedCount +
jobRetryCountInfo.nonTransientRetriedCount +
jobRetryCountInfo.unKnownRetriedCount;
frameworkStatus.applicationExitCode,
);

const platformRetries = frameworkStatus.frameworkRetryPolicyState.transientNormalRetriedCount;
const resourceRetries = frameworkStatus.frameworkRetryPolicyState.transientConflictRetriedCount;
const userRetries = frameworkStatus.frameworkRetryPolicyState.unKnownRetriedCount;
jobDetail.jobStatus = {
name: framework.name,
username: 'unknown',
state: jobState,
subState: frameworkStatus.frameworkState,
executionType: framework.summarizedFrameworkInfo.executionType,
retries: jobRetryCount,
retries: platformRetries + resourceRetries + userRetries,
retryDetails: {
user: userRetries,
platform: platformRetries,
resource: resourceRetries,
},
createdTime: frameworkStatus.frameworkCreatedTimestamp,
completedTime: frameworkStatus.frameworkCompletedTimestamp,
appId: frameworkStatus.applicationId,
Expand Down Expand Up @@ -388,6 +422,7 @@ class Job {
}
jobDetail.taskRoles[taskRole].taskStatuses.push({
taskIndex: task.taskIndex,
taskState: this.convertTaskState(task.taskState, task.containerExitCode),
containerId: task.containerId,
containerIp: task.containerIp,
containerPorts,
Expand Down