Page History
Table of Contents
These scripts and log files can be found in ~cpo/problems/crowdstrike/.
First Iteration
Submitted the following script on s3df multiple times (also an identical script with constraint "CrowdStrike_off"):
...
Code Block |
---|
#!/bin/bash
# SLURM batch script: run the psana timing job on one exclusive rome node with
# the CrowdStrike_on constraint (an identical script used CrowdStrike_off).
# NOTE(review): the page export collapsed this script onto one line; #SBATCH
# directives are only honored when each is on its own line, restored below.
#SBATCH --dependency=singleton
#SBATCH --job-name=cson
#SBATCH --partition=roma
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=120
#SBATCH --output=%j.log
#SBATCH --constraint=CrowdStrike_on
#SBATCH --account=lcls:prjdat21

# Marker line ("***cson <hostname>") that the analysis scripts grep for to
# identify the constraint and the node a job ran on.
echo "***cson " `hostname`
# Drop the page cache so every run reads data cold (comparable I/O timing).
/sdf/group/scs/tools/free-pagecache
time mpirun python mfxl1028222.py
Code Block |
---|
# NOTE(review): this code block was garbled by the page export -- two scripts
# were character-interleaved.  Reconstructed below is mfxl1028222.py, the psana
# timing job that the sbatch script runs; fragments of an earlier log-analysis
# helper (glob('*.log'), node extraction) were the other interleaved script and
# are not part of this job.  Confirm against the original page history.
from psana import *
import sys
import time

# Wall-clock start, so job-startup overhead can be separated from the
# per-event processing time.
startup_begin = time.time()

ds = MPIDataSource('exp=mfxl1028222:run=90:smd')
det = Detector('epix10k2M')

ngood = 0
for nevt, evt in enumerate(ds.events()):
    calib = det.calib(evt)
    if calib is not None:
        ngood += 1
    if nevt == 0:
        # The first event marks the end of startup; event-loop timing starts
        # here, deliberately excluding that first event.
        startup_end = time.time()
        start = time.time()
tottime = time.time() - start
#print('processed',ngood,tottime,tottime/(ngood-1)) # we ignored first event so -1
#print('startup',startup_end-startup_begin)
Code Block |
---|
# Second-iteration analysis: parse the sbatch job logs under iter2/, split the
# `time` command's "real" runtimes by CrowdStrike constraint ("cson" marker =
# on), print per-node job counts, and report the fractional runtime change
# with its statistical error.
# NOTE(review): reconstructed from an export-garbled code block; the structure
# was confirmed against the readable script tail shown further down this page
# and against the printed output (nodecount listing + fractional change).
import glob

logs = glob.glob('iter2/*.log')
logs.sort() # put them in time order
nodes = []
ontimes = []
offtimes = []
onnodes = []
offnodes = []
#print('***',logs)

def nodecount(nodelist):
    # Print how many jobs landed on each unique node.
    uniquenodes = set(nodelist)
    for n in uniquenodes:
        print(n,nodelist.count(n))

for log in logs:
    f = open(log,'r')
    on = False
    for line in f:
        if '***' in line:
            # Marker written by the sbatch script: "***cson <hostname>" (or
            # the corresponding off-constraint marker).
            if 'cson' in line:
                on=True
            node = line.split()[1]
        if 'real' in line:
            # `time` output, e.g. "real 3m12.456s" -> total seconds.
            timestr = line.split()[1]
            hours_minutes = timestr.split('m')
            minutes = float(hours_minutes[0])
            seconds = float(hours_minutes[1][:-1])
            time = minutes*60+seconds
            #if node in nodes:
            #    print('skipping duplicate node',node)
            #    continue
            nodes.append(node)
            if on:
                ontimes.append(time)
                onnodes.append(node)
            else:
                offtimes.append(time)
                offnodes.append(node)

import numpy as np
mean = []
err_on_mean = []
for times in [offtimes,ontimes]:
    #print(times)
    mean.append(np.mean(times))
    # Standard error on the mean for each sample.
    err_on_mean.append(np.std(times)/np.sqrt(len(times)))
# Errors of the two independent means add in quadrature.
diff_err = np.sqrt(err_on_mean[0]**2+err_on_mean[1]**2)
diff = mean[1]-mean[0]
print('*** offnodes job count:')
nodecount(offnodes)
print('*** onnodes job count:')
nodecount(onnodes)
print('Fractional change:',diff/mean[0],'+-',diff_err/mean[0])
import matplotlib.pyplot as plt
plt.hist([ontimes,offtimes])
plt.show()
Output:
Last line of output:
Code Block |
---|
Fractional change: 0.15766088705149858 +- 0.01466574691536575 |
Plot:
# NOTE(review): tail of the second-iteration analysis script.  The page export
# flattened the indentation: the lines down through offnodes.append(node) are
# inside the per-line log-parsing loop (under the "if 'real' in line:" branch
# of "for line in f:").  Code is left byte-identical; restore indentation
# before running.
# Parse the `time` command's "real" field, e.g. "3m12.456s", into seconds.
hours_minutes = timestr.split('m')
minutes = float(hours_minutes[0])
# Seconds field carries a trailing 's'; strip it before converting.
seconds = float(hours_minutes[1][:-1])
time = minutes*60+seconds
#if node in nodes:
# print('skipping duplicate node',node)
# continue
nodes.append(node)
# "on" was set earlier from the "***cson" marker: CrowdStrike_on jobs go into
# ontimes/onnodes, the rest into offtimes/offnodes.
if on:
ontimes.append(time)
onnodes.append(node)
else:
offtimes.append(time)
offnodes.append(node)
import numpy as np
mean = []
err_on_mean = []
# index 0 = CrowdStrike off, index 1 = on; keep this order for diff below.
for times in [offtimes,ontimes]:
#print(times)
mean.append(np.mean(times))
# Standard error on the mean of each sample.
err_on_mean.append(np.std(times)/np.sqrt(len(times)))
# Errors of the two independent means add in quadrature.
diff_err = np.sqrt(err_on_mean[0]**2+err_on_mean[1]**2)
diff = mean[1]-mean[0]
print('*** offnodes job count:')
nodecount(offnodes)
print('*** onnodes job count:')
nodecount(onnodes)
print('Fractional change:',diff/mean[0],'+-',diff_err/mean[0])
import matplotlib.pyplot as plt
plt.hist([ontimes,offtimes])
plt.show() |
Output:
*** offnodes job count:
sdfrome039 34
sdfrome087 2
sdfrome037 25
sdfrome042 39
*** onnodes job count:
sdfrome007 100
Fractional change: 0.15766088705149858 +- 0.01466574691536575
Third Iteration
*** offnodes job count:
sdfrome035 14
sdfrome114 35
sdfrome087 6
sdfrome042 42
sdfrome073 1
sdfrome036 2
*** onnodes job count:
sdfrome019 34
sdfrome004 2
sdfrome021 64
Fractional change: 0.2588939230105063 +- 0.016541549294602324
Fourth Iteration
*** offnodes job count:
sdfrome042 48
sdfrome043 14
sdfrome111 1
sdfrome039 27
sdfrome086 10
*** onnodes job count:
sdfrome016 100
Fractional change: 0.2359417044882193 +- 0.015870310667490246
Update 2024-09-15
We repeated the test on the roma partition, 105 iterations each with constraint CrowdStrike_on/CrowdStrike_off alternating. This test was performed during a period of low utilization of the rome partition, with no competing network or storage contention.
Measured runtime for psana analysis of mfxl1028222 run=29:smd on exclusive node with 120 cores.
Note: the previous measurements were done with run=90:smd. We chose run=29:smd, because it has more events and therefore takes longer, minimizing effects related to job startup.
Fractional change: 0.24461288024797354 +- 0.001079561972505891