[ui, deployments] Job status for System Jobs (#17046)

* System jobs get a panel and lost status reinstated

* Leveraging nodes and not worrying about reschedules for system jobs

* Consistency with restarted allocations as well

* Text shadow removed and early return where possible

* System jobs added to the Historical Click list

* System alloc and client summary panels removed

* Bones of some new system job tests

* [ui, deployments] handle node read permissions for system job panel (#17073)

* Do the next best thing when we can't read nodes for system jobs

* Whitespace-control Handlebars expression

* Simplifies system jobs to not attempt to show a desired count, since that is a particularly complex number depending on constraints, number of nodes, etc. (see the first sketch after this list)

* [ui, deployments] Fix order in which allocations are ascribed to the status chart (#17063)

* Discovery of alloc.isOld (sketched after this list)

* Correct sorting and better types

* A more honest walk-back that prioritizes running and pending allocs first (see the walk-back sketch after this list)

* Test scenario for descending-order allocs to show

* isOld mandates that we set a job version for our created job. Could also do this in the factory, but that might introduce side effects

* Type simplification

* Fixed up a test that needed system job summary to be updated

* Tests for modifications to the job summary

* Explicitly mark the service jobs in test as not-deploying
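
The "desired count" bullet above is easiest to see as a before/after of the running-allocations headline. Below is a framework-free sketch of that idea, assuming plain objects for the job, its task groups, and its allocations; the real logic lives in the Steady panel component and template further down in this diff.

// Hypothetical illustration only; property names mirror the diff, but this
// function itself is not part of the change.
function runningAllocsHeadline(job) {
  const running = job.allocations.filter((a) => a.clientStatus === 'running');
  if (job.type === 'system') {
    // System jobs place at most one alloc per eligible node, so there is no
    // single meaningful "desired" denominator to show.
    return `${running.length} Allocations Running`;
  }
  // Service jobs: the desired total is the sum of task group counts.
  const desired = job.taskGroups.reduce((sum, tg) => sum + tg.count, 0);
  return `${running.length}/${desired} Allocations Running`;
}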
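Several bullets reference an alloc.isOld property that the components below consume but this diff never defines (the allocation model change that adds it lands elsewhere). A minimal sketch of what such a derived property might look like, assuming it compares the allocation's jobVersion with a version property on its job:

// app/models/allocation.js — hypothetical sketch only; the actual model change
// that introduces `isOld` is not shown in this diff.
import Model, { attr, belongsTo } from '@ember-data/model';

export default class Allocation extends Model {
  @belongsTo('job') job;
  @attr('number') jobVersion;

  // An allocation is "old" when it was placed by an earlier version of its job.
  // The `version` property on the job is an assumed name here.
  get isOld() {
    return this.jobVersion !== this.job.get('version');
  }
}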
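The "honest walk-back" bullet describes the ordering the Steady panel's allocBlocks getter now uses when filling the status chart. A condensed, framework-free sketch of just that ordering, assuming plain allocation objects with clientStatus and jobVersion fields:

// Hypothetical standalone illustration; the real implementation is the
// allocBlocks getter in the Steady component shown later in this diff.
function walkBackAllocations(allocations, totalAllocs) {
  const statuses = ['running', 'pending', 'failed', 'lost', 'unknown'];
  const buckets = Object.fromEntries(statuses.map((s) => [s, []]));
  let slots = totalAllocs;

  // 1. Running and pending allocations always count first.
  for (const alloc of allocations.filter(
    (a) => a.clientStatus === 'running' || a.clientStatus === 'pending'
  )) {
    if (slots === 0) break;
    buckets[alloc.clientStatus].push(alloc);
    slots--;
  }

  // 2. Fill any remaining slots from the other statuses, newest jobVersion first.
  const rest = allocations
    .filter((a) => a.clientStatus !== 'running' && a.clientStatus !== 'pending')
    .sort((a, b) => b.jobVersion - a.jobVersion);
  for (const alloc of rest) {
    if (slots === 0) break;
    if (buckets[alloc.clientStatus]) {
      buckets[alloc.clientStatus].push(alloc);
      slots--;
    }
  }

  // 3. Anything still unaccounted for is shown as "unplaced".
  buckets.unplaced = Array.from({ length: slots }, () => ({
    clientStatus: 'unplaced',
  }));
  return buckets;
}

For a job with four desired allocations and allocs at (running v5, pending v5, running v3, failed v4, lost v5), this yields two running, one pending, and one lost, which is what the new acceptance test further down asserts.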
Phil Renaud 2023-05-05 16:25:21 -04:00 committed by GitHub
parent 2cf27389ad
commit 2fbbac5dd8
11 changed files with 378 additions and 92 deletions

View File

@@ -1,5 +1,5 @@
{{#if this.isActivelyDeploying}}
<JobStatus::Panel::Deploying @job={{@job}} @handleError={{@handleError}} />
{{else}}
<JobStatus::Panel::Steady @job={{@job}} @statusMode={{@statusMode}} @setStatusMode={{@setStatusMode}} />
<JobStatus::Panel::Steady @job={{@job}} @statusMode={{@statusMode}} @setStatusMode={{@setStatusMode}} @nodes={{this.nodes}} />
{{/if}}

View File

@@ -1,8 +1,18 @@
// @ts-check
import Component from '@glimmer/component';
import { inject as service } from '@ember/service';
export default class JobStatusPanelComponent extends Component {
@service store;
get isActivelyDeploying() {
return this.args.job.get('latestDeployment.isRunning');
}
get nodes() {
if (!this.args.job.get('hasClientStatus')) {
return [];
}
return this.store.peekAll('node');
}
}

View File

@@ -30,9 +30,7 @@ export default class JobStatusPanelDeployingComponent extends Component {
// allocations we can track throughout a deployment
establishOldAllocBlockIDs() {
this.oldVersionAllocBlockIDs = this.job.allocations.filter(
(a) =>
a.clientStatus === 'running' &&
a.jobVersion !== this.deployment.get('versionNumber')
(a) => a.clientStatus === 'running' && a.isOld
);
}
@@ -84,7 +82,7 @@ export default class JobStatusPanelDeployingComponent extends Component {
get newVersionAllocBlocks() {
let availableSlotsToFill = this.desiredTotal;
let allocationsOfDeploymentVersion = this.job.allocations.filter(
(a) => a.jobVersion === this.deployment.get('versionNumber')
(a) => !a.isOld
);
let allocationCategories = this.allocTypes.reduce((categories, type) => {
@@ -153,19 +151,11 @@ export default class JobStatusPanelDeployingComponent extends Component {
}
get rescheduledAllocs() {
return this.job.allocations.filter(
(a) =>
a.jobVersion === this.job.latestDeployment.get('versionNumber') &&
a.hasBeenRescheduled
);
return this.job.allocations.filter((a) => !a.isOld && a.hasBeenRescheduled);
}
get restartedAllocs() {
return this.job.allocations.filter(
(a) =>
a.jobVersion === this.job.latestDeployment.get('versionNumber') &&
a.hasBeenRestarted
);
return this.job.allocations.filter((a) => !a.isOld && a.hasBeenRestarted);
}
// #region legend

View File

@@ -21,7 +21,14 @@
{{#if (eq @statusMode "historical")}}
<JobPage::Parts::SummaryChart @job={{@job}} />
{{else}}
<h3 class="title is-4 running-allocs-title"><strong>{{@job.runningAllocs}}/{{this.totalAllocs}}</strong> Allocations Running</h3>
<h3 class="title is-4 running-allocs-title">
<strong>
{{@job.runningAllocs ~}}
{{#unless this.atMostOneAllocPerNode ~}}
/{{this.totalAllocs}}
{{/unless}}
</strong>
{{pluralize "Allocation" @job.runningAllocs}} Running</h3>
<JobStatus::AllocationStatusRow @allocBlocks={{this.allocBlocks}} @steady={{true}} />
<div class="legend-and-summary">
@@ -41,12 +48,14 @@
{{/each}}
</legend>
<JobStatus::FailedOrLost
@allocs={{this.rescheduledAllocs}}
@job={{@job}}
@title="Rescheduled"
@description="Allocations that have been rescheduled, on another node if possible, due to failure"
/>
{{#if this.supportsRescheduling}}
<JobStatus::FailedOrLost
@allocs={{this.rescheduledAllocs}}
@job={{@job}}
@title="Rescheduled"
@description="Allocations that have been rescheduled, on another node if possible, due to failure"
/>
{{/if}}
<JobStatus::FailedOrLost
@allocs={{this.restartedAllocs}}
@@ -60,7 +69,7 @@
<ul>
{{#each-in this.versions as |version allocs|}}
<li>
<LinkTo @route="jobs.job.allocations" @model={{@job}} @query={{hash version=(concat '[' version ']') status=(concat '["running", "pending", "failed"]') }}>
<LinkTo data-version={{version}} @route="jobs.job.allocations" @model={{@job}} @query={{hash version=(concat '[' version ']') status=(concat '["running", "pending", "failed"]') }}>
<Hds::Badge @text={{concat "v" version}} />
<Hds::BadgeCount @text={{allocs.length}} @type="inverted" />
</LinkTo>

View File

@@ -21,32 +21,91 @@ export default class JobStatusPanelSteadyComponent extends Component {
};
});
/**
* @typedef {Object} HealthStatus
* @property {Array} nonCanary
* @property {Array} canary
*/
/**
* @typedef {Object} AllocationStatus
* @property {HealthStatus} healthy
* @property {HealthStatus} unhealthy
*/
/**
* @typedef {Object} AllocationBlock
* @property {AllocationStatus} [RUNNING]
* @property {AllocationStatus} [PENDING]
* @property {AllocationStatus} [FAILED]
* @property {AllocationStatus} [LOST]
* @property {AllocationStatus} [UNPLACED]
*/
/**
* Looks through running/pending allocations with the aim of filling up your desired number of allocations.
* If any desired remain, it will walk backwards through job versions and other allocation types to build
* a picture of the job's overall status.
*
* @returns {AllocationBlock} An object containing healthy non-canary allocations
* for each clientStatus.
*/
get allocBlocks() {
let availableSlotsToFill = this.totalAllocs;
// Only fill up to 100% of totalAllocs. Once we've filled up, we can stop counting.
let allocationsOfShowableType = this.allocTypes.reduce((blocks, type) => {
const jobAllocsOfType = this.args.job.allocations
.sortBy('jobVersion') // Try counting from latest deployment's allocs and work backwards if needed
.reverse()
.filterBy('clientStatus', type.label);
if (availableSlotsToFill > 0) {
blocks[type.label] = {
healthy: {
nonCanary: Array(
Math.min(availableSlotsToFill, jobAllocsOfType.length)
)
.fill()
.map((_, i) => {
return jobAllocsOfType[i];
}),
},
};
availableSlotsToFill -= blocks[type.label].healthy.nonCanary.length;
} else {
blocks[type.label] = { healthy: { nonCanary: [] } };
// Initialize allocationsOfShowableType with empty arrays for each clientStatus
/**
* @type {AllocationBlock}
*/
let allocationsOfShowableType = this.allocTypes.reduce(
(accumulator, type) => {
accumulator[type.label] = { healthy: { nonCanary: [] } };
return accumulator;
},
{}
);
// First accumulate the Running/Pending allocations
for (const alloc of this.job.allocations.filter(
(a) => a.clientStatus === 'running' || a.clientStatus === 'pending'
)) {
if (availableSlotsToFill === 0) {
break;
}
return blocks;
}, {});
const status = alloc.clientStatus;
allocationsOfShowableType[status].healthy.nonCanary.push(alloc);
availableSlotsToFill--;
}
// Sort all allocs by jobVersion in descending order
const sortedAllocs = this.args.job.allocations
.filter(
(a) => a.clientStatus !== 'running' && a.clientStatus !== 'pending'
)
.sortBy('jobVersion')
.reverse();
// Iterate over the sorted allocs
for (const alloc of sortedAllocs) {
if (availableSlotsToFill === 0) {
break;
}
const status = alloc.clientStatus;
// If the alloc has another clientStatus, add it to the corresponding list
// as long as we haven't reached the totalAllocs limit for that clientStatus
if (
this.allocTypes.map(({ label }) => label).includes(status) &&
allocationsOfShowableType[status].healthy.nonCanary.length <
this.totalAllocs
) {
allocationsOfShowableType[status].healthy.nonCanary.push(alloc);
availableSlotsToFill--;
}
}
// Handle unplaced allocs
if (availableSlotsToFill > 0) {
allocationsOfShowableType['unplaced'] = {
healthy: {
@@ -58,16 +117,26 @@ export default class JobStatusPanelSteadyComponent extends Component {
},
};
}
return allocationsOfShowableType;
}
// TODO: eventually we will want this from a new property on a job.
get totalAllocs() {
// v----- Experimental method: Count all allocs. Good for testing but not a realistic representation of "Desired"
// return this.allocTypes.reduce((sum, type) => sum + this.args.job[type.property], 0);
get nodes() {
return this.args.nodes;
}
// v----- Realistic method: Tally a job's task groups' "count" property
return this.args.job.taskGroups.reduce((sum, tg) => sum + tg.count, 0);
get totalAllocs() {
if (this.args.job.type === 'service') {
return this.args.job.taskGroups.reduce((sum, tg) => sum + tg.count, 0);
} else if (this.atMostOneAllocPerNode) {
return this.args.job.allocations.uniqBy('nodeID').length;
} else {
return this.args.job.count; // TODO: this is probably not the correct totalAllocs count for any type.
}
}
get atMostOneAllocPerNode() {
return this.args.job.type === 'system';
}
get versions() {
@@ -86,18 +155,14 @@
}
get rescheduledAllocs() {
return this.job.allocations.filter(
(a) =>
a.jobVersion === this.job.latestDeployment.get('versionNumber') &&
a.hasBeenRescheduled
);
return this.job.allocations.filter((a) => !a.isOld && a.hasBeenRescheduled);
}
get restartedAllocs() {
return this.job.allocations.filter(
(a) =>
a.jobVersion === this.job.latestDeployment.get('versionNumber') &&
a.hasBeenRestarted
);
return this.job.allocations.filter((a) => !a.isOld && a.hasBeenRestarted);
}
get supportsRescheduling() {
return this.job.type !== 'system';
}
}

View File

@@ -32,6 +32,7 @@ export default class Allocation extends Model {
@belongsTo('job') job;
@belongsTo('node') node;
@attr('string') namespace;
@attr('string') nodeID;
@attr('string') name;
@attr('string') taskGroupName;
@fragment('resources') resources;

View File

@@ -159,7 +159,6 @@
// TODO: we eventually want to establish a minimum width here. However, we need to also include this in the allocation-status-block width computation.
font-size: 0.8rem;
font-weight: bold;
text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.3);
width: 100%;
& > .rest-count {

View File

@@ -9,8 +9,7 @@
<jobPage.ui.Title />
<jobPage.ui.StatsBox />
<jobPage.ui.DasRecommendations />
<jobPage.ui.JobClientStatusSummary />
<jobPage.ui.Summary @forceCollapsed="true" />
<jobPage.ui.StatusPanel @statusMode={{@statusMode}} @setStatusMode={{@setStatusMode}} />
<jobPage.ui.PlacementFailures />
<jobPage.ui.TaskGroups @sortProperty={{@sortProperty}} @sortDescending={{@sortDescending}} />
<jobPage.ui.RecentAllocations @activeTask={{@activeTask}} @setActiveTaskQueryParam={{@setActiveTaskQueryParam}} />

View File

@@ -21,30 +21,11 @@ moduleForJob('Acceptance | job detail (batch)', 'allocations', () =>
);
moduleForJob('Acceptance | job detail (system)', 'allocations', () =>
server.create('job', { type: 'system', shallow: true })
);
moduleForJobWithClientStatus(
'Acceptance | job detail with client status (system)',
() =>
server.create('job', {
status: 'running',
datacenters: ['dc1'],
type: 'system',
createAllocations: false,
})
);
moduleForJobWithClientStatus(
'Acceptance | job detail with client status (system with wildcard dc)',
() =>
server.create('job', {
id: 'system-wildcard-dc',
status: 'running',
datacenters: ['canada-*-1'],
type: 'system',
createAllocations: false,
})
server.create('job', {
type: 'system',
shallow: true,
noActiveDeployment: true,
})
);
moduleForJob('Acceptance | job detail (sysbatch)', 'allocations', () =>
@@ -244,7 +225,7 @@ moduleForJob(
moduleForJob(
'Acceptance | job detail (service)',
'allocations',
() => server.create('job', { type: 'service' }),
() => server.create('job', { type: 'service', noActiveDeployment: true }),
{
'the subnav links to deployment': async (job, assert) => {
await JobDetail.tabFor('deployments').visit();
@@ -285,6 +266,7 @@ module('Acceptance | job detail (with namespaces)', function (hooks) {
type: 'service',
status: 'running',
namespaceId: server.db.namespaces[1].name,
noActiveDeployment: true,
});
server.createList('job', 3, {
namespaceId: server.db.namespaces[0].name,
@@ -433,6 +415,7 @@ module('Acceptance | job detail (with namespaces)', function (hooks) {
namespaceId: server.db.namespaces[1].name,
groupsCount: 3,
createRecommendations: true,
noActiveDeployment: true,
});
window.localStorage.nomadTokenSecret = managementToken.secretId;

View File

@@ -176,6 +176,84 @@ module('Acceptance | job status panel', function (hooks) {
});
});
test('After running/pending allocations are covered, fill in allocs by jobVersion, descending', async function (assert) {
assert.expect(9);
let job = server.create('job', {
status: 'running',
datacenters: ['*'],
type: 'service',
resourceSpec: ['M: 256, C: 500'], // a single group
createAllocations: false,
groupTaskCount: 4,
shallow: true,
version: 5,
});
server.create('allocation', {
jobId: job.id,
clientStatus: 'running',
jobVersion: 5,
});
server.create('allocation', {
jobId: job.id,
clientStatus: 'pending',
jobVersion: 5,
});
server.create('allocation', {
jobId: job.id,
clientStatus: 'running',
jobVersion: 3,
});
server.create('allocation', {
jobId: job.id,
clientStatus: 'failed',
jobVersion: 4,
});
server.create('allocation', {
jobId: job.id,
clientStatus: 'lost',
jobVersion: 5,
});
await visit(`/jobs/${job.id}`);
assert.dom('.job-status-panel').exists();
// We expect to see 4 represented-allocations, since that's the number in our groupTaskCount
assert
.dom('.ungrouped-allocs .represented-allocation')
.exists({ count: 4 });
// We expect 2 of them to be running, and one to be pending, since running/pending allocations supersede other clientStatuses
assert
.dom('.ungrouped-allocs .represented-allocation.running')
.exists({ count: 2 });
assert
.dom('.ungrouped-allocs .represented-allocation.pending')
.exists({ count: 1 });
// We expect the lone other allocation to be lost, since it has the highest jobVersion
assert
.dom('.ungrouped-allocs .represented-allocation.lost')
.exists({ count: 1 });
// We expect the job versions legend to show 3 at v5 (running, pending, and lost), and 1 at v3 (old running), and none at v4 (failed is not represented)
assert.dom('.job-status-panel .versions > ul > li').exists({ count: 2 });
assert
.dom('.job-status-panel .versions > ul > li > a[data-version="5"]')
.exists({ count: 1 });
assert
.dom('.job-status-panel .versions > ul > li > a[data-version="3"]')
.exists({ count: 1 });
assert
.dom('.job-status-panel .versions > ul > li > a[data-version="4"]')
.doesNotExist();
await percySnapshot(assert, {
percyCSS: `
.allocation-row td { display: none; }
`,
});
});
test('Status Panel groups allocations when they get past a threshold', async function (assert) {
assert.expect(6);
@@ -470,6 +548,7 @@ module('Acceptance | job status panel', function (hooks) {
groupTaskCount,
activeDeployment: true,
shallow: true,
version: 0,
});
let state = server.create('task-state');
@@ -628,4 +707,155 @@ module('Acceptance | job status panel', function (hooks) {
.exists('No match message is shown');
});
});
module('System jobs', function () {
test('System jobs show restarted but not rescheduled allocs', async function (assert) {
this.store = this.owner.lookup('service:store');
let job = server.create('job', {
status: 'running',
datacenters: ['*'],
type: 'system',
createAllocations: true,
allocStatusDistribution: {
running: 0.5,
failed: 0.5,
unknown: 0,
lost: 0,
},
noActiveDeployment: true,
shallow: true,
version: 0,
});
let state = server.create('task-state');
state.events = server.schema.taskEvents.where({ taskStateId: state.id });
server.schema.allocations.where({ jobId: job.id }).update({
taskStateIds: [state.id],
jobVersion: 0,
});
await visit(`/jobs/${job.id}`);
assert.dom('.job-status-panel').exists();
assert.dom('.failed-or-lost').exists({ count: 1 });
assert.dom('.failed-or-lost h4').hasText('Restarted');
assert
.dom('.failed-or-lost-link')
.hasText('0', 'Restarted cell at zero by default');
// A wild event appears! Change a recent task event to type "Restarting" in a task state:
this.store
.peekAll('job')
.objectAt(0)
.get('allocations')
.objectAt(0)
.get('states')
.objectAt(0)
.get('events')
.objectAt(0)
.set('type', 'Restarting');
await settled();
assert
.dom('.failed-or-lost-link')
.hasText(
'1',
'Restarted cell updates when a task event with type "Restarting" is added'
);
});
test('System jobs do not have a sense of Desired/Total allocs', async function (assert) {
this.store = this.owner.lookup('service:store');
server.db.nodes.remove();
server.createList('node', 3, {
status: 'ready',
drain: false,
schedulingEligibility: 'eligible',
});
let job = server.create('job', {
status: 'running',
datacenters: ['*'],
type: 'system',
createAllocations: false,
noActiveDeployment: true,
shallow: true,
version: 0,
});
// Create an allocation on this job for each node
server.schema.nodes.all().models.forEach((node) => {
server.create('allocation', {
jobId: job.id,
jobVersion: 0,
clientStatus: 'running',
nodeId: node.id,
});
});
await visit(`/jobs/${job.id}`);
let storedJob = await this.store.find(
'job',
JSON.stringify([job.id, 'default'])
);
// Weird Mirage thing: job summary factory is disconnected from its job and therefore allocations.
// So we manually create the number here.
let summary = await storedJob.get('summary');
summary
.get('taskGroupSummaries')
.objectAt(0)
.set(
'runningAllocs',
server.schema.allocations.where({
jobId: job.id,
clientStatus: 'running',
}).length
);
await settled();
assert.dom('.job-status-panel').exists();
assert.dom('.running-allocs-title').hasText(
`${
server.schema.allocations.where({
jobId: job.id,
clientStatus: 'running',
}).length
} Allocations Running`
);
// Let's bring another node online!
let newNode = server.create('node', {
status: 'ready',
drain: false,
schedulingEligibility: 'eligible',
});
// Let's expect our scheduler to have therefore added an alloc to it
server.create('allocation', {
jobId: job.id,
jobVersion: 0,
clientStatus: 'running',
nodeId: newNode.id,
});
summary
.get('taskGroupSummaries')
.objectAt(0)
.set(
'runningAllocs',
server.schema.allocations.where({
jobId: job.id,
clientStatus: 'running',
}).length
);
await settled();
assert.dom('.running-allocs-title').hasText('4 Allocations Running');
});
});
});

View File

@@ -18,7 +18,7 @@ import { setupMirage } from 'ember-cli-mirage/test-support';
import JobDetail from 'nomad-ui/tests/pages/jobs/detail';
import setPolicy from 'nomad-ui/tests/utils/set-policy';
const jobTypesWithStatusPanel = ['service'];
const jobTypesWithStatusPanel = ['service', 'system'];
async function switchToHistorical() {
await JobDetail.statusModes.historical.click();
@@ -56,7 +56,7 @@ export default function moduleForJob(
await JobDetail.visit({ id: `${job.id}@${job.namespace}` });
}
const hasClientStatus = ['system', 'sysbatch'].includes(job.type);
const hasClientStatus = ['sysbatch'].includes(job.type);
if (context === 'allocations' && hasClientStatus) {
await click("[data-test-accordion-summary-chart='allocation-status']");
}