Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

Table of Contents

These scripts and log files can be found in ~cpo/problems/crowdstrike/.

First Iteration

Submitted the following script on s3df multiple times (also an identical script with constraint "CrowdStrike_off"):

...

Code Block
#!/bin/bash
# Slurm batch script: times one MPI psana run (mfxl1028222.py) on a node
# selected with the CrowdStrike_on constraint. Everything the job prints ends
# up in <jobid>.log, which the log-analysis script on this page parses.

# Per the Slurm sbatch documentation, "singleton" holds this job until earlier
# jobs with the same job name and user have finished, so repeated submissions
# of this script run one after another rather than concurrently.
#SBATCH --dependency=singleton
# Job name; together with singleton this groups the "on" submissions.
#SBATCH --job-name=cson
#SBATCH --partition=roma
# One node, 120 MPI tasks on it.
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=120
# %j is replaced by the job id, giving one log file per job.
#SBATCH --output=%j.log
# Only run on nodes carrying the CrowdStrike_on feature.
#SBATCH --constraint=CrowdStrike_on
#SBATCH --account=lcls:prjdat21

# Marker line "***cson  <hostname>": the analysis script looks for '***' and
# 'cson' in the log and takes the second whitespace-separated field as the node.
echo "***cson " `hostname`
# NOTE(review): presumably drops the node's page cache so data cached by an
# earlier run does not skew the timing -- confirm what this tool does.
/sdf/group/scs/tools/free-pagecache
# bash `time` reports real/user/sys; the analysis script parses the "real" line.
time mpirun python mfxl1028222.py

Code Block
# mfxl1028222.py -- psana benchmark launched by the batch script via
# `time mpirun python mfxl1028222.py`.
#
# Loops over the events of run 90 of experiment mfxl1028222, calibrates the
# epix10k2M detector image for each event and counts the events for which a
# calibrated image was returned. Startup time and per-event time are recorded
# but only reported if the print statements at the bottom are re-enabled.
import time
# Taken before the psana import on purpose, so that the import itself is
# included in the measured startup time.
startup_begin = time.time()
from psana import *
import sys

# MPIDataSource / Detector are provided by the psana star-import above.
# NOTE(review): ':smd' presumably selects psana's small-data mode -- confirm.
ds = MPIDataSource('exp=mfxl1028222:run=90:smd')
det = Detector('epix10k2M')
ngood=0
for nevt,evt in enumerate(ds.events()):
    calib = det.calib(evt)
    # Only count events for which calibrated data came back.
    if calib is not None:
        ngood+=1
    # The first event marks the end of startup; the event-loop timer starts
    # here, so the first event itself is excluded from the per-event timing.
    # NOTE(review): the diff rendering leaves the nesting of this `if`
    # ambiguous; it is placed at loop level here -- confirm against the file in
    # ~cpo/problems/crowdstrike/.
    if nevt==0:
        startup_end = time.time()
        start = time.time()
# NOTE(review): `start` is only bound once an event has been seen, so a rank
# that receives no events would raise NameError on the next line.
tottime = time.time()-start
#print('processed',ngood,tottime,tottime/(ngood-1)) # we ignored first event so -1
#print('startup',startup_end-startup_begin)
Code Block
import glob

# Job logs of the second iteration (one file per Slurm job).
logs = sorted(glob.glob('iter2/*.log'))  # put them in time order

# Accumulators filled by the per-log loop below:
#   nodes              -- node name of every job, in log order
#   ontimes / offtimes -- wall-clock seconds of CrowdStrike-on / -off jobs
#   onnodes / offnodes -- node names of CrowdStrike-on / -off jobs
nodes, ontimes, offtimes, onnodes, offnodes = ([] for _ in range(5))
#print('***',logs)
def nodecount(nodelist):
    """Print every distinct node name in *nodelist* with its job count.

    One line is printed per distinct node, formatted as ``<node> <count>``.
    Nodes are listed in the order in which they first appear in *nodelist*.
    Nothing is printed for an empty list.
    """
    # Local import keeps this function self-contained within the script.
    from collections import Counter

    # Counter tallies in a single pass and iterates in first-seen order.
    for n, count in Counter(nodelist).items():
        print(n, count)

# Parse every job log: find out whether the job ran with CrowdStrike on or off,
# which node it ran on, and how long it took, then file the result under the
# matching on/off lists.
for log in logs:
    on = False
    # Reset per log so that an incomplete log cannot silently reuse the node
    # or time parsed from the previous log.
    node = None
    time = None
    with open(log,'r') as f:
        for line in f:
            # Marker line written by the batch script: "***cson <hostname>"
            # for the CrowdStrike_on job; any other '***' line counts as off.
            if '***' in line:
                if 'cson' in line:
                    on=True
                node = line.split()[1]
            # Wall-clock line from bash `time`, e.g. "real 12m34.567s".
            if 'real' in line:
                timestr = line.split()[1]
                hours_minutes = timestr.split('m')
                minutes = float(hours_minutes[0])
                # [:-1] strips the trailing 's' from the seconds field.
                seconds = float(hours_minutes[1][:-1])
                time = minutes*60+seconds
    if node is None or time is None:
        # Job did not produce both the marker and the timing line (e.g. it
        # failed or is still running); leave it out of the statistics.
        print('skipping incomplete log',log)
        continue
    #if node in nodes:
    #    print('skipping duplicate node',node)
    #    continue
    nodes.append(node)
    if on:
        ontimes.append(time)
        onnodes.append(node)
    else:
        offtimes.append(time)
        offnodes.append(node)
import numpy as np

# Summary statistics of the job run times.
# Index 0 is the CrowdStrike-off sample, index 1 the CrowdStrike-on sample.
samples = (offtimes, ontimes)
mean = [np.mean(sample) for sample in samples]
# Error on each mean: standard deviation divided by sqrt(number of jobs).
err_on_mean = [np.std(sample)/np.sqrt(len(sample)) for sample in samples]

off_mean, on_mean = mean
off_err, on_err = err_on_mean
# Slow-down of "on" relative to "off"; the two errors are added in quadrature.
diff = on_mean-off_mean
diff_err = np.sqrt(off_err**2+on_err**2)

# How many jobs landed on each node, per constraint.
print('*** offnodes job count:')
nodecount(offnodes)
print('*** onnodes job count:')
nodecount(onnodes)
# Difference and its error, both expressed as a fraction of the "off" mean.
print('Fractional change:',diff/off_mean,'+-',diff_err/off_mean)

# Histograms of the two run-time distributions.
import matplotlib.pyplot as plt
plt.hist([ontimes,offtimes])
plt.show()

Output:

*** offnodes job count:
sdfrome039 34
sdfrome087 2
sdfrome037 25
sdfrome042 39
*** onnodes job count:
sdfrome007 100
Fractional change: 0.15766088705149858 +- 0.01466574691536575

Image Added

Third Iteration

*** offnodes job count:
sdfrome035 14
sdfrome114 35
sdfrome087 6
sdfrome042 42
sdfrome073 1
sdfrome036 2
*** onnodes job count:
sdfrome019 34
sdfrome004 2
sdfrome021 64
Fractional change: 0.2588939230105063 +- 0.016541549294602324

Image Added

Fourth Iteration

*** offnodes job count:
sdfrome042 48
sdfrome043 14
sdfrome111 1
sdfrome039 27
sdfrome086 10
*** onnodes job count:
sdfrome016 100
Fractional change: 0.2359417044882193 +- 0.015870310667490246

Image Added

Update 2024-09-15


We repeated the test on the roma partition, 105 iterations for each constraint, alternating between CrowdStrike_on and CrowdStrike_off. This test was performed during a period of low utilization of the roma partition, with no competing network or storage contention.

Measured runtime for psana analysis of mfxl1028222 run=29:smd on an exclusive node with 120 cores.

Note: the previous measurements were done with run=90:smd. We chose run=29:smd, because it has more events and therefore takes longer, minimizing effects related to job startup.
Image Added

Image Removed

Fractional change: 0.24461288024797354 +- 0.001079561972505891