8 changes: 5 additions & 3 deletions .github/workflows/frontier/submit.sh
@@ -34,12 +34,13 @@ output_file="$job_slug.out"
submit_output=$(sbatch <<EOT
#!/bin/bash
#SBATCH -J MFC-$job_slug # Job name
#SBATCH -A ENG160 # charge account
#SBATCH -A CFD154 # charge account
#SBATCH -N 1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
#SBATCH -t 01:59:00 # Duration of the job
#SBATCH -o$output_file # Combined output and error messages file
#SBATCH -p extended # Extended partition for shorter queues
#SBATCH -p batch # Batch partition (concurrent jobs)
#SBATCH --qos=hackathon # Hackathon QOS for batch access

set -e
set -x
@@ -50,6 +51,7 @@ echo "Running in $(pwd):"
job_slug="$job_slug"
job_device="$2"
job_interface="$3"
job_shard="$4"

. ./mfc.sh load -c f -m $([ "$2" = "gpu" ] && echo "g" || echo "c")

7 changes: 6 additions & 1 deletion .github/workflows/frontier/test.sh
@@ -13,8 +13,13 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

shard_opts=""
if [ -n "$job_shard" ]; then
shard_opts="--shard $job_shard"
fi

if [ "$job_device" = "gpu" ]; then
./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c frontier
else
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
fi
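The `shard_opts` plumbing above follows a common POSIX-sh pattern: build the flag string only when the variable is set, then let the unquoted expansion contribute nothing when it is empty. A minimal standalone sketch under that assumption (the `build_shard_opts` name is hypothetical, not part of the scripts):

```shell
# Hypothetical sketch of the optional-flag pattern used in test.sh:
# emit "--shard i/n" only when a shard spec is given, so an unquoted
# $shard_opts expansion adds no arguments when sharding is disabled.
build_shard_opts() {
    if [ -n "$1" ]; then
        printf '%s' "--shard $1"
    fi
}

build_shard_opts "1/2"   # prints: --shard 1/2
build_shard_opts ""      # prints nothing
```

Note that `shard_opts` must stay unquoted at the call site (`$shard_opts`, not `"$shard_opts"`) so an empty value disappears rather than becoming an empty-string argument.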
8 changes: 5 additions & 3 deletions .github/workflows/frontier_amd/submit.sh
@@ -34,12 +34,13 @@ output_file="$job_slug.out"
submit_output=$(sbatch <<EOT
#!/bin/bash
#SBATCH -J MFC-$job_slug # Job name
#SBATCH -A ENG160 # charge account
#SBATCH -A CFD154 # charge account
#SBATCH -N 1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
#SBATCH -t 01:59:00 # Duration of the job
#SBATCH -o$output_file # Combined output and error messages file
#SBATCH -p extended # Extended partition for shorter queues
#SBATCH -p batch # Batch partition (concurrent jobs)
#SBATCH --qos=hackathon # Hackathon QOS for batch access

set -e
set -x
@@ -50,6 +51,7 @@ echo "Running in $(pwd):"
job_slug="$job_slug"
job_device="$2"
job_interface="$3"
job_shard="$4"

. ./mfc.sh load -c famd -m $([ "$2" = "gpu" ] && echo "g" || echo "c")

7 changes: 6 additions & 1 deletion .github/workflows/frontier_amd/test.sh
@@ -13,8 +13,13 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

shard_opts=""
if [ -n "$job_shard" ]; then
shard_opts="--shard $job_shard"
fi

if [ "$job_device" = "gpu" ]; then
./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts -- -c frontier_amd
./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c frontier_amd
else
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier_amd
fi
1 change: 1 addition & 0 deletions .github/workflows/phoenix/submit.sh
@@ -48,6 +48,7 @@ submit_output=$(sbatch <<EOT
$sbatch_device_opts
#SBATCH -t 03:00:00 # Duration of the job (Ex: 15 mins)
#SBATCH -q embers # QOS Name
#SBATCH --requeue # Auto-requeue on preemption
#SBATCH -o$output_file # Combined output and error messages file

set -e
71 changes: 60 additions & 11 deletions .github/workflows/test.yml
@@ -28,7 +28,7 @@ jobs:

- name: Check Formatting
run: |
./mfc.sh format -j $(nproc)
./mfc.sh format -j "$(nproc)"
git diff --exit-code || (echo "::error::Code is not formatted. Run './mfc.sh format' locally." && exit 1)

- name: Spell Check
@@ -138,19 +138,38 @@ jobs:

- name: Build
run: |
/bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL
/bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL
env:
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}

- name: Test
run: |
/bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT
run: |
rm -f tests/failed_uuids.txt
TEST_EXIT=0
/bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT || TEST_EXIT=$?

# Retry only if a small number of tests failed (sporadic failures)
if [ -f tests/failed_uuids.txt ]; then
NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
if [ "$NUM_FAILED" -gt 0 ] && [ "$NUM_FAILED" -le 5 ]; then
FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ')
echo ""
echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
echo ""
/bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only $FAILED $TEST_ALL || exit $?
else
echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
exit 1
fi
elif [ "$TEST_EXIT" -ne 0 ]; then
exit $TEST_EXIT
fi
env:
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}

self:
name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})"
if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
needs: [lint-gate, file-changes]
continue-on-error: false
@@ -164,50 +183,74 @@ jobs:
cluster_name: 'Georgia Tech | Phoenix'
device: 'gpu'
interface: 'acc'
shard: ''
- runner: 'gt'
cluster: 'phoenix'
cluster_name: 'Georgia Tech | Phoenix'
device: 'gpu'
interface: 'omp'
shard: ''
- runner: 'gt'
cluster: 'phoenix'
cluster_name: 'Georgia Tech | Phoenix'
device: 'cpu'
interface: 'none'
# Frontier (ORNL) — build on login node, test via SLURM
shard: ''
# Frontier (ORNL) — build on login node, GPU tests sharded for batch partition
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'gpu'
interface: 'acc'
shard: '1/2'
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'gpu'
interface: 'acc'
shard: '2/2'
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'gpu'
interface: 'omp'
shard: '1/2'
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'gpu'
interface: 'omp'
shard: '2/2'
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'cpu'
interface: 'none'
# Frontier AMD — build on login node, test via SLURM
shard: ''
# Frontier AMD — build on login node, GPU tests sharded for batch partition
- runner: 'frontier'
cluster: 'frontier_amd'
cluster_name: 'Oak Ridge | Frontier (AMD)'
device: 'gpu'
interface: 'omp'
shard: '1/2'
- runner: 'frontier'
cluster: 'frontier_amd'
cluster_name: 'Oak Ridge | Frontier (AMD)'
device: 'gpu'
interface: 'omp'
shard: '2/2'
- runner: 'frontier'
cluster: 'frontier_amd'
cluster_name: 'Oak Ridge | Frontier (AMD)'
device: 'cpu'
interface: 'none'
shard: ''
runs-on:
group: phoenix
labels: ${{ matrix.runner }}
env:
NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }}
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
steps:
- name: Clone
uses: actions/checkout@v4
@@ -216,10 +259,16 @@ jobs:

- name: Build
if: matrix.cluster != 'phoenix'
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
uses: nick-fields/retry@v3
with:
max_attempts: 3
retry_wait_seconds: 60
timeout_minutes: 480
command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
on_retry_command: ./mfc.sh clean

- name: Test
run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }}
run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}

- name: Print Logs
if: always()
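The retry logic added to the `Test` step reruns the suite only when a small number of tests failed, on the theory that a handful of failures is likely sporadic while many failures indicate a real regression. The gate can be sketched as a standalone function, assuming the same `failed_uuids.txt` format (one UUID per line); `should_retry` and its threshold parameter are illustrative names, not part of the workflow:

```shell
# Illustrative sketch of the CI retry gate above. Returns success
# (exit 0) only when the failed-UUID file exists and lists between
# 1 and $max_retry tests; otherwise a retry is not worthwhile.
should_retry() {
    failed_file="$1"
    max_retry="${2:-5}"
    [ -f "$failed_file" ] || return 1      # no file: nothing to retry
    n=$(wc -l < "$failed_file")
    [ "$n" -gt 0 ] && [ "$n" -le "$max_retry" ]
}
```

This is why the same change makes `test.py` delete a stale `failed_uuids.txt` on the abort path and after a clean run: the gate treats the file's existence as evidence of fresh failures.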
6 changes: 6 additions & 0 deletions toolchain/mfc/cli/commands.py
@@ -452,6 +452,12 @@
default=False,
dest="dry_run",
),
Argument(
name="shard",
help="Run only a subset of tests (e.g., '1/2' for first half, '2/2' for second half).",
type=str,
default=None,
),
],
mutually_exclusive=[
MutuallyExclusiveGroup(arguments=[
22 changes: 22 additions & 0 deletions toolchain/mfc/test/test.py
@@ -99,6 +99,14 @@ def __filter(cases_) -> typing.List[TestCase]:
skipped_cases += example_cases
cases = [case for case in cases if case not in example_cases]

if ARG("shard") is not None:
parts = ARG("shard").split("/")
if len(parts) != 2 or not all(p.isdigit() for p in parts) or int(parts[1]) < 1 or not 1 <= int(parts[0]) <= int(parts[1]):
raise MFCException(f"Invalid --shard '{ARG('shard')}': expected 'i/n' with 1 <= i <= n (e.g., '1/2').")
shard_idx, shard_count = int(parts[0]), int(parts[1])
skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1]
cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1]

if ARG("percent") == 100:
return cases, skipped_cases

@@ -182,6 +190,11 @@ def test():

# Check if we aborted due to high failure rate
if abort_tests.is_set():
# Clean up stale failed_uuids.txt so CI doesn't retry wrong tests
failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
if os.path.exists(failed_uuids_path):
os.remove(failed_uuids_path)
Comment on lines +193 to +196
⚠️ Potential issue | 🟡 Minor

Unguarded os.remove() on abort path can mask the abort exception.

If os.remove() at line 196 raises an OSError (e.g., a permission error), it propagates up and replaces the MFCException at lines 202/206, obscuring the real abort reason. Wrap it in a try/except OSError consistent with the pattern flagged in the I/O block below.

🛡️ Proposed fix
         failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
-        if os.path.exists(failed_uuids_path):
-            os.remove(failed_uuids_path)
+        try:
+            if os.path.exists(failed_uuids_path):
+                os.remove(failed_uuids_path)
+        except OSError:
+            pass


total_completed = nFAIL + nPASS
cons.print()
cons.unindent()
@@ -206,6 +219,15 @@ def test():
# Build the summary report
_print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases)

# Write failed UUIDs to file for CI retry logic
failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
if failed_tests:
with open(failed_uuids_path, "w") as f:
for test_info in failed_tests:
f.write(test_info['uuid'] + "\n")
elif os.path.exists(failed_uuids_path):
os.remove(failed_uuids_path)

exit(nFAIL)
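The sharding added to `__filter` is a round-robin partition: shard `i/n` keeps every case whose zero-based index satisfies `index % n == i - 1`, so the `n` shards together cover every case exactly once with no overlap. A self-contained sketch of that arithmetic (the `shard_cases` name is hypothetical; the validation mirrors the check above):

```python
def shard_cases(cases, shard):
    """Round-robin partition: shard 'i/n' keeps cases whose zero-based
    index satisfies index % n == i - 1, so the n shards are disjoint
    and together cover the full case list."""
    parts = shard.split("/")
    if len(parts) != 2 or not all(p.isdigit() for p in parts):
        raise ValueError(f"Invalid shard '{shard}': expected 'i/n' (e.g., '1/2').")
    idx, count = int(parts[0]), int(parts[1])
    if count < 1 or not 1 <= idx <= count:
        raise ValueError(f"Invalid shard '{shard}': expected 1 <= i <= n.")
    return [c for i, c in enumerate(cases) if i % count == idx - 1]
```

Interleaving by index rather than splitting the list in halves keeps the shards balanced even when expensive cases cluster at one end of the ordering.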


Expand Down
Loading