[ui, deployments] Job status for System Jobs (#17046)

* System jobs get a panel and lost status reinstated

* Leveraging nodes and not worrying about reschedules for system jobs

* Consistency with restarted allocations as well

* Text shadow removed and early return where possible

* System jobs added to the Historical Click list

* System alloc and client summary panels removed

* Bones of some new system job tests

* [ui, deployments] handle node read permissions for system job panel (#17073)

* Do the next best thing when we can't read nodes for system jobs

* Whitespace-control Handlebars expression

* Simplifies system jobs to not attempt to show a desired count, since that is a particularly complex number depending on constraints, number of nodes, etc. (see the first sketch after this list)

* [ui, deployments] Fix order in which allocations are ascribed to the status chart (#17063)

* Discovery of alloc.isOld (sketched after this list)

* Correct sorting and better types

* A more honest walk-back that prioritizes running and pending allocs first (see the walk-back sketch after this list)

* Test scenario for descending-order allocs to show

* isOld mandates that we set a job version for our created job. Could also do this in the factory, but that might introduce side effects

* Type simplification

* Fixed up a test that needed system job summary to be updated

* Tests for modifications to the job summary

* Explicitly mark the service jobs in test as not-deploying
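
The "desired count" bullet above is easiest to see as a before/after of the running-allocations headline. Below is a framework-free sketch of that idea, assuming plain objects for the job, its task groups, and its allocations; the real logic lives in the Steady panel component and template further down in this diff.

// Hypothetical illustration only; property names mirror the diff, but this
// function itself is not part of the change.
function runningAllocsHeadline(job) {
  const running = job.allocations.filter((a) => a.clientStatus === 'running');
  if (job.type === 'system') {
    // System jobs place at most one alloc per eligible node, so there is no
    // single meaningful "desired" denominator to show.
    return `${running.length} Allocations Running`;
  }
  // Service jobs: the desired total is the sum of task group counts.
  const desired = job.taskGroups.reduce((sum, tg) => sum + tg.count, 0);
  return `${running.length}/${desired} Allocations Running`;
}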
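Several bullets reference an alloc.isOld property that the components below consume but this diff never defines (the allocation model change that adds it lands elsewhere). A minimal sketch of what such a derived property might look like, assuming it compares the allocation's jobVersion with a version property on its job:

// app/models/allocation.js — hypothetical sketch only; the actual model change
// that introduces `isOld` is not shown in this diff.
import Model, { attr, belongsTo } from '@ember-data/model';

export default class Allocation extends Model {
  @belongsTo('job') job;
  @attr('number') jobVersion;

  // An allocation is "old" when it was placed by an earlier version of its job.
  // The `version` property on the job is an assumed name here.
  get isOld() {
    return this.jobVersion !== this.job.get('version');
  }
}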
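The "honest walk-back" bullet describes the ordering the Steady panel's allocBlocks getter now uses when filling the status chart. A condensed, framework-free sketch of just that ordering, assuming plain allocation objects with clientStatus and jobVersion fields:

// Hypothetical standalone illustration; the real implementation is the
// allocBlocks getter in the Steady component shown later in this diff.
function walkBackAllocations(allocations, totalAllocs) {
  const statuses = ['running', 'pending', 'failed', 'lost', 'unknown'];
  const buckets = Object.fromEntries(statuses.map((s) => [s, []]));
  let slots = totalAllocs;

  // 1. Running and pending allocations always count first.
  for (const alloc of allocations.filter(
    (a) => a.clientStatus === 'running' || a.clientStatus === 'pending'
  )) {
    if (slots === 0) break;
    buckets[alloc.clientStatus].push(alloc);
    slots--;
  }

  // 2. Fill any remaining slots from the other statuses, newest jobVersion first.
  const rest = allocations
    .filter((a) => a.clientStatus !== 'running' && a.clientStatus !== 'pending')
    .sort((a, b) => b.jobVersion - a.jobVersion);
  for (const alloc of rest) {
    if (slots === 0) break;
    if (buckets[alloc.clientStatus]) {
      buckets[alloc.clientStatus].push(alloc);
      slots--;
    }
  }

  // 3. Anything still unaccounted for is shown as "unplaced".
  buckets.unplaced = Array.from({ length: slots }, () => ({
    clientStatus: 'unplaced',
  }));
  return buckets;
}

For a job with four desired allocations and allocs at (running v5, pending v5, running v3, failed v4, lost v5), this yields two running, one pending, and one lost, which is what the new acceptance test further down asserts.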
Phil Renaud 2023-05-05 16:25:21 -04:00 committed by GitHub
parent 2cf27389ad
commit 2fbbac5dd8
11 changed files with 378 additions and 92 deletions

View File

@@ -1,5 +1,5 @@
{{#if this.isActivelyDeploying}}
<JobStatus::Panel::Deploying @job={{@job}} @handleError={{@handleError}} />
{{else}}
<JobStatus::Panel::Steady @job={{@job}} @statusMode={{@statusMode}} @setStatusMode={{@setStatusMode}} />
<JobStatus::Panel::Steady @job={{@job}} @statusMode={{@statusMode}} @setStatusMode={{@setStatusMode}} @nodes={{this.nodes}} />
{{/if}}

View File

@@ -1,8 +1,18 @@
// @ts-check
import Component from '@glimmer/component';
import { inject as service } from '@ember/service';
export default class JobStatusPanelComponent extends Component {
@service store;
get isActivelyDeploying() {
return this.args.job.get('latestDeployment.isRunning');
}
get nodes() {
if (!this.args.job.get('hasClientStatus')) {
return [];
}
return this.store.peekAll('node');
}
}

View File

@@ -30,9 +30,7 @@ export default class JobStatusPanelDeployingComponent extends Component {
// allocations we can track throughout a deployment
establishOldAllocBlockIDs() {
this.oldVersionAllocBlockIDs = this.job.allocations.filter(
(a) =>
a.clientStatus === 'running' &&
a.jobVersion !== this.deployment.get('versionNumber')
(a) => a.clientStatus === 'running' && a.isOld
);
}
@@ -84,7 +82,7 @@ export default class JobStatusPanelDeployingComponent extends Component {
get newVersionAllocBlocks() {
let availableSlotsToFill = this.desiredTotal;
let allocationsOfDeploymentVersion = this.job.allocations.filter(
(a) => a.jobVersion === this.deployment.get('versionNumber')
(a) => !a.isOld
);
let allocationCategories = this.allocTypes.reduce((categories, type) => {
@@ -153,19 +151,11 @@ export default class JobStatusPanelDeployingComponent extends Component {
}
get rescheduledAllocs() {
return this.job.allocations.filter(
(a) =>
a.jobVersion === this.job.latestDeployment.get('versionNumber') &&
a.hasBeenRescheduled
);
return this.job.allocations.filter((a) => !a.isOld && a.hasBeenRescheduled);
}
get restartedAllocs() {
return this.job.allocations.filter(
(a) =>
a.jobVersion === this.job.latestDeployment.get('versionNumber') &&
a.hasBeenRestarted
);
return this.job.allocations.filter((a) => !a.isOld && a.hasBeenRestarted);
}
// #region legend

View File

@@ -21,7 +21,14 @@
{{#if (eq @statusMode "historical")}}
<JobPage::Parts::SummaryChart @job={{@job}} />
{{else}}
<h3 class="title is-4 running-allocs-title"><strong>{{@job.runningAllocs}}/{{this.totalAllocs}}</strong> Allocations Running</h3>
<h3 class="title is-4 running-allocs-title">
<strong>
{{@job.runningAllocs ~}}
{{#unless this.atMostOneAllocPerNode ~}}
/{{this.totalAllocs}}
{{/unless}}
</strong>
{{pluralize "Allocation" @job.runningAllocs}} Running</h3>
<JobStatus::AllocationStatusRow @allocBlocks={{this.allocBlocks}} @steady={{true}} />
<div class="legend-and-summary">
@@ -41,12 +48,14 @@
{{/each}}
</legend>
<JobStatus::FailedOrLost
@allocs={{this.rescheduledAllocs}}
@job={{@job}}
@title="Rescheduled"
@description="Allocations that have been rescheduled, on another node if possible, due to failure"
/>
{{#if this.supportsRescheduling}}
<JobStatus::FailedOrLost
@allocs={{this.rescheduledAllocs}}
@job={{@job}}
@title="Rescheduled"
@description="Allocations that have been rescheduled, on another node if possible, due to failure"
/>
{{/if}}
<JobStatus::FailedOrLost
@allocs={{this.restartedAllocs}}
@@ -60,7 +69,7 @@
<ul>
{{#each-in this.versions as |version allocs|}}
<li>
<LinkTo @route="jobs.job.allocations" @model={{@job}} @query={{hash version=(concat '[' version ']') status=(concat '["running", "pending", "failed"]') }}>
<LinkTo data-version={{version}} @route="jobs.job.allocations" @model={{@job}} @query={{hash version=(concat '[' version ']') status=(concat '["running", "pending", "failed"]') }}>
<Hds::Badge @text={{concat "v" version}} />
<Hds::BadgeCount @text={{allocs.length}} @type="inverted" />
</LinkTo>

View File

@@ -21,32 +21,91 @@ export default class JobStatusPanelSteadyComponent extends Component {
};
});
/**
* @typedef {Object} HealthStatus
* @property {Array} nonCanary
* @property {Array} canary
*/
/**
* @typedef {Object} AllocationStatus
* @property {HealthStatus} healthy
* @property {HealthStatus} unhealthy
*/
/**
* @typedef {Object} AllocationBlock
* @property {AllocationStatus} [RUNNING]
* @property {AllocationStatus} [PENDING]
* @property {AllocationStatus} [FAILED]
* @property {AllocationStatus} [LOST]
* @property {AllocationStatus} [UNPLACED]
*/
/**
* Looks through running/pending allocations with the aim of filling up your desired number of allocations.
* If any desired remain, it will walk backwards through job versions and other allocation types to build
* a picture of the job's overall status.
*
* @returns {AllocationBlock} An object containing healthy non-canary allocations
* for each clientStatus.
*/
get allocBlocks() {
let availableSlotsToFill = this.totalAllocs;
// Only fill up to 100% of totalAllocs. Once we've filled up, we can stop counting.
let allocationsOfShowableType = this.allocTypes.reduce((blocks, type) => {
const jobAllocsOfType = this.args.job.allocations
.sortBy('jobVersion') // Try counting from latest deployment's allocs and work backwards if needed
.reverse()
.filterBy('clientStatus', type.label);
if (availableSlotsToFill > 0) {
blocks[type.label] = {
healthy: {
nonCanary: Array(
Math.min(availableSlotsToFill, jobAllocsOfType.length)
)
.fill()
.map((_, i) => {
return jobAllocsOfType[i];
}),
},
};
availableSlotsToFill -= blocks[type.label].healthy.nonCanary.length;
} else {
blocks[type.label] = { healthy: { nonCanary: [] } };
// Initialize allocationsOfShowableType with empty arrays for each clientStatus
/**
* @type {AllocationBlock}
*/
let allocationsOfShowableType = this.allocTypes.reduce(
(accumulator, type) => {
accumulator[type.label] = { healthy: { nonCanary: [] } };
return accumulator;
},
{}
);
// First accumulate the Running/Pending allocations
for (const alloc of this.job.allocations.filter(
(a) => a.clientStatus === 'running' || a.clientStatus === 'pending'
)) {
if (availableSlotsToFill === 0) {
break;
}
return blocks;
}, {});
const status = alloc.clientStatus;
allocationsOfShowableType[status].healthy.nonCanary.push(alloc);
availableSlotsToFill--;
}
// Sort all allocs by jobVersion in descending order
const sortedAllocs = this.args.job.allocations
.filter(
(a) => a.clientStatus !== 'running' && a.clientStatus !== 'pending'
)
.sortBy('jobVersion')
.reverse();
// Iterate over the sorted allocs
for (const alloc of sortedAllocs) {
if (availableSlotsToFill === 0) {
break;
}
const status = alloc.clientStatus;
// If the alloc has another clientStatus, add it to the corresponding list
// as long as we haven't reached the totalAllocs limit for that clientStatus
if (
this.allocTypes.map(({ label }) => label).includes(status) &&
allocationsOfShowableType[status].healthy.nonCanary.length <
this.totalAllocs
) {
allocationsOfShowableType[status].healthy.nonCanary.push(alloc);
availableSlotsToFill--;
}
}
// Handle unplaced allocs
if (availableSlotsToFill > 0) {
allocationsOfShowableType['unplaced'] = {
healthy: {
@@ -58,16 +117,26 @@ export default class JobStatusPanelSteadyComponent extends Component {
},
};
}
return allocationsOfShowableType;
}
// TODO: eventually we will want this from a new property on a job.
get totalAllocs() {
// v----- Experimental method: Count all allocs. Good for testing but not a realistic representation of "Desired"
// return this.allocTypes.reduce((sum, type) => sum + this.args.job[type.property], 0);
get nodes() {
return this.args.nodes;
}
// v----- Realistic method: Tally a job's task groups' "count" property
return this.args.job.taskGroups.reduce((sum, tg) => sum + tg.count, 0);
get totalAllocs() {
if (this.args.job.type === 'service') {
return this.args.job.taskGroups.reduce((sum, tg) => sum + tg.count, 0);
} else if (this.atMostOneAllocPerNode) {
return this.args.job.allocations.uniqBy('nodeID').length;
} else {
return this.args.job.count; // TODO: this is probably not the correct totalAllocs count for any type.
}
}
get atMostOneAllocPerNode() {
return this.args.job.type === 'system';
}
get versions() {
@@ -86,18 +155,14 @@
}
get rescheduledAllocs() {
return this.job.allocations.filter(
(a) =>
a.jobVersion === this.job.latestDeployment.get('versionNumber') &&
a.hasBeenRescheduled
);
return this.job.allocations.filter((a) => !a.isOld && a.hasBeenRescheduled);
}
get restartedAllocs() {
return this.job.allocations.filter(
(a) =>
a.jobVersion === this.job.latestDeployment.get('versionNumber') &&
a.hasBeenRestarted
);
return this.job.allocations.filter((a) => !a.isOld && a.hasBeenRestarted);
}
get supportsRescheduling() {
return this.job.type !== 'system';
}
}

View File

@@ -32,6 +32,7 @@ export default class Allocation extends Model {
@belongsTo('job') job;
@belongsTo('node') node;
@attr('string') namespace;
@attr('string') nodeID;
@attr('string') name;
@attr('string') taskGroupName;
@fragment('resources') resources;

View File

@@ -159,7 +159,6 @@
// TODO: we eventually want to establish a minimum width here. However, we need to also include this in the allocation-status-block width computation.
font-size: 0.8rem;
font-weight: bold;
text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.3);
width: 100%;
& > .rest-count {

View File

@@ -9,8 +9,7 @@
<jobPage.ui.Title />
<jobPage.ui.StatsBox />
<jobPage.ui.DasRecommendations />
<jobPage.ui.JobClientStatusSummary />
<jobPage.ui.Summary @forceCollapsed="true" />
<jobPage.ui.StatusPanel @statusMode={{@statusMode}} @setStatusMode={{@setStatusMode}} />
<jobPage.ui.PlacementFailures />
<jobPage.ui.TaskGroups @sortProperty={{@sortProperty}} @sortDescending={{@sortDescending}} />
<jobPage.ui.RecentAllocations @activeTask={{@activeTask}} @setActiveTaskQueryParam={{@setActiveTaskQueryParam}} />

View File

@@ -21,30 +21,11 @@ moduleForJob('Acceptance | job detail (batch)', 'allocations', () =>
);
moduleForJob('Acceptance | job detail (system)', 'allocations', () =>
server.create('job', { type: 'system', shallow: true })
);
moduleForJobWithClientStatus(
'Acceptance | job detail with client status (system)',
() =>
server.create('job', {
status: 'running',
datacenters: ['dc1'],
type: 'system',
createAllocations: false,
})
);
moduleForJobWithClientStatus(
'Acceptance | job detail with client status (system with wildcard dc)',
() =>
server.create('job', {
id: 'system-wildcard-dc',
status: 'running',
datacenters: ['canada-*-1'],
type: 'system',
createAllocations: false,
})
server.create('job', {
type: 'system',
shallow: true,
noActiveDeployment: true,
})
);
moduleForJob('Acceptance | job detail (sysbatch)', 'allocations', () =>
@@ -244,7 +225,7 @@ moduleForJob(
moduleForJob(
'Acceptance | job detail (service)',
'allocations',
() => server.create('job', { type: 'service' }),
() => server.create('job', { type: 'service', noActiveDeployment: true }),
{
'the subnav links to deployment': async (job, assert) => {
await JobDetail.tabFor('deployments').visit();
@@ -285,6 +266,7 @@ module('Acceptance | job detail (with namespaces)', function (hooks) {
type: 'service',
status: 'running',
namespaceId: server.db.namespaces[1].name,
noActiveDeployment: true,
});
server.createList('job', 3, {
namespaceId: server.db.namespaces[0].name,
@@ -433,6 +415,7 @@ module('Acceptance | job detail (with namespaces)', function (hooks) {
namespaceId: server.db.namespaces[1].name,
groupsCount: 3,
createRecommendations: true,
noActiveDeployment: true,
});
window.localStorage.nomadTokenSecret = managementToken.secretId;

View File

@@ -176,6 +176,84 @@ module('Acceptance | job status panel', function (hooks) {
});
});
test('After running/pending allocations are covered, fill in allocs by jobVersion, descending', async function (assert) {
assert.expect(9);
let job = server.create('job', {
status: 'running',
datacenters: ['*'],
type: 'service',
resourceSpec: ['M: 256, C: 500'], // a single group
createAllocations: false,
groupTaskCount: 4,
shallow: true,
version: 5,
});
server.create('allocation', {
jobId: job.id,
clientStatus: 'running',
jobVersion: 5,
});
server.create('allocation', {
jobId: job.id,
clientStatus: 'pending',
jobVersion: 5,
});
server.create('allocation', {
jobId: job.id,
clientStatus: 'running',
jobVersion: 3,
});
server.create('allocation', {
jobId: job.id,
clientStatus: 'failed',
jobVersion: 4,
});
server.create('allocation', {
jobId: job.id,
clientStatus: 'lost',
jobVersion: 5,
});
await visit(`/jobs/${job.id}`);
assert.dom('.job-status-panel').exists();
// We expect to see 4 represented-allocations, since that's the number in our groupTaskCount
assert
.dom('.ungrouped-allocs .represented-allocation')
.exists({ count: 4 });
// We expect 2 of them to be running, and one to be pending, since running/pending allocations supersede other clientStatuses
assert
.dom('.ungrouped-allocs .represented-allocation.running')
.exists({ count: 2 });
assert
.dom('.ungrouped-allocs .represented-allocation.pending')
.exists({ count: 1 });
// We expect the lone other allocation to be lost, since it has the highest jobVersion
assert
.dom('.ungrouped-allocs .represented-allocation.lost')
.exists({ count: 1 });
// We expect the job versions legend to show 3 at v5 (running, pending, and lost), and 1 at v3 (old running), and none at v4 (failed is not represented)
assert.dom('.job-status-panel .versions > ul > li').exists({ count: 2 });
assert
.dom('.job-status-panel .versions > ul > li > a[data-version="5"]')
.exists({ count: 1 });
assert
.dom('.job-status-panel .versions > ul > li > a[data-version="3"]')
.exists({ count: 1 });
assert
.dom('.job-status-panel .versions > ul > li > a[data-version="4"]')
.doesNotExist();
await percySnapshot(assert, {
percyCSS: `
.allocation-row td { display: none; }
`,
});
});
test('Status Panel groups allocations when they get past a threshold', async function (assert) {
assert.expect(6);
@@ -470,6 +548,7 @@ module('Acceptance | job status panel', function (hooks) {
groupTaskCount,
activeDeployment: true,
shallow: true,
version: 0,
});
let state = server.create('task-state');
@@ -628,4 +707,155 @@ module('Acceptance | job status panel', function (hooks) {
.exists('No match message is shown');
});
});
module('System jobs', function () {
test('System jobs show restarted but not rescheduled allocs', async function (assert) {
this.store = this.owner.lookup('service:store');
let job = server.create('job', {
status: 'running',
datacenters: ['*'],
type: 'system',
createAllocations: true,
allocStatusDistribution: {
running: 0.5,
failed: 0.5,
unknown: 0,
lost: 0,
},
noActiveDeployment: true,
shallow: true,
version: 0,
});
let state = server.create('task-state');
state.events = server.schema.taskEvents.where({ taskStateId: state.id });
server.schema.allocations.where({ jobId: job.id }).update({
taskStateIds: [state.id],
jobVersion: 0,
});
await visit(`/jobs/${job.id}`);
assert.dom('.job-status-panel').exists();
assert.dom('.failed-or-lost').exists({ count: 1 });
assert.dom('.failed-or-lost h4').hasText('Restarted');
assert
.dom('.failed-or-lost-link')
.hasText('0', 'Restarted cell at zero by default');
// A wild event appears! Change a recent task event to type "Restarting" in a task state:
this.store
.peekAll('job')
.objectAt(0)
.get('allocations')
.objectAt(0)
.get('states')
.objectAt(0)
.get('events')
.objectAt(0)
.set('type', 'Restarting');
await settled();
assert
.dom('.failed-or-lost-link')
.hasText(
'1',
'Restarted cell updates when a task event with type "Restarting" is added'
);
});
test('System jobs do not have a sense of Desired/Total allocs', async function (assert) {
this.store = this.owner.lookup('service:store');
server.db.nodes.remove();
server.createList('node', 3, {
status: 'ready',
drain: false,
schedulingEligibility: 'eligible',
});
let job = server.create('job', {
status: 'running',
datacenters: ['*'],
type: 'system',
createAllocations: false,
noActiveDeployment: true,
shallow: true,
version: 0,
});
// Create an allocation on this job for each node
server.schema.nodes.all().models.forEach((node) => {
server.create('allocation', {
jobId: job.id,
jobVersion: 0,
clientStatus: 'running',
nodeId: node.id,
});
});
await visit(`/jobs/${job.id}`);
let storedJob = await this.store.find(
'job',
JSON.stringify([job.id, 'default'])
);
// Weird Mirage thing: job summary factory is disconnected from its job and therefore allocations.
// So we manually create the number here.
let summary = await storedJob.get('summary');
summary
.get('taskGroupSummaries')
.objectAt(0)
.set(
'runningAllocs',
server.schema.allocations.where({
jobId: job.id,
clientStatus: 'running',
}).length
);
await settled();
assert.dom('.job-status-panel').exists();
assert.dom('.running-allocs-title').hasText(
`${
server.schema.allocations.where({
jobId: job.id,
clientStatus: 'running',
}).length
} Allocations Running`
);
// Let's bring another node online!
let newNode = server.create('node', {
status: 'ready',
drain: false,
schedulingEligibility: 'eligible',
});
// Let's expect our scheduler to have therefore added an alloc to it
server.create('allocation', {
jobId: job.id,
jobVersion: 0,
clientStatus: 'running',
nodeId: newNode.id,
});
summary
.get('taskGroupSummaries')
.objectAt(0)
.set(
'runningAllocs',
server.schema.allocations.where({
jobId: job.id,
clientStatus: 'running',
}).length
);
await settled();
assert.dom('.running-allocs-title').hasText('4 Allocations Running');
});
});
});

View File

@@ -18,7 +18,7 @@ import { setupMirage } from 'ember-cli-mirage/test-support';
import JobDetail from 'nomad-ui/tests/pages/jobs/detail';
import setPolicy from 'nomad-ui/tests/utils/set-policy';
const jobTypesWithStatusPanel = ['service'];
const jobTypesWithStatusPanel = ['service', 'system'];
async function switchToHistorical() {
await JobDetail.statusModes.historical.click();
@@ -56,7 +56,7 @@ export default function moduleForJob(
await JobDetail.visit({ id: `${job.id}@${job.namespace}` });
}
const hasClientStatus = ['system', 'sysbatch'].includes(job.type);
const hasClientStatus = ['sysbatch'].includes(job.type);
if (context === 'allocations' && hasClientStatus) {
await click("[data-test-accordion-summary-chart='allocation-status']");
}