...
We use 396 cores on 9 srcf nodes (32 eventbuilder cores) to keep up with all eight xtc2 files. The Python script and the bash job script for Slurm are shown below.
Code Block |
---|
language | py |
---|
title | test_live.py |
---|
|
import time
import os,sys
from psana import DataSource
import numpy as np
import vals
from mpi4py import MPI
# MPI setup shared by the rest of the script: the world communicator, the
# number of ranks in it, and this process's own rank.
comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()
def test_standard():
    """Iterate over all events of one run and return the global event count.

    The experiment name is taken from ``sys.argv[1]`` and the run number from
    ``sys.argv[2]``.  Every rank counts the events it iterates over and
    prints its own rate each time the per-run event index reaches a non-zero
    multiple of 1000.  The per-rank counts are then gathered on rank 0,
    summed there, and the sum is broadcast so that all ranks return the same
    value.
    """
    hutch='tst'
    exp=sys.argv[1]
    runno=int(sys.argv[2])

    # NOTE(review): max_events=0 presumably means "no limit" -- confirm
    # against the psana DataSource documentation.
    source_options = {
        'exp': exp,
        'run': runno,
        'batch_size': 1000,
        'max_events': 0,
        'dir': f'/cds/data/drpsrcf/{hutch}/{exp}/xtc/',
        'live': True,
    }
    data_source = DataSource(**source_options)

    # One-element counter for this rank; only rank 0 needs a receive buffer
    # with one row per rank.
    local_count = np.zeros(1, dtype='i')
    gathered = np.empty([size, 1], dtype='i') if rank == 0 else None

    st = time.time()
    for current_run in data_source.runs():
        for nevt, _evt in enumerate(current_run.events()):
            if nevt > 0 and nevt % 1000 == 0:
                en = time.time()
                # 1000 events over the elapsed interval, reported in kHz.
                print(f'RANK: {rank:4d} EVENTS: {nevt:10d} RATE: {(1000/(en-st))*1e-3:.2f}kHz', flush=True)
                st = time.time()
            local_count += 1

    # Count total no. of events: collect on rank 0, sum, share with everyone.
    comm.Gather(local_count, gathered, root=0)
    total = np.sum(gathered) if rank == 0 else None
    return comm.bcast(total, root=0)
if __name__ == "__main__":
    # Synchronize all ranks before and after the event loop so that the
    # t0..t1 interval brackets the work of every rank.
    comm.Barrier()
    t0 = MPI.Wtime()

    n_events = test_standard()

    comm.Barrier()
    t1 = MPI.Wtime()

    # Only rank 0 reports the job-wide summary.
    if rank == 0:
        # PS_EB_NODES falls back to 1 when it is not set in the environment.
        n_eb_nodes = int(os.getenv('PS_EB_NODES', '1'))
        summary = f'TOTAL TIME:{t1-t0:.2f}s #EB: {n_eb_nodes:3d} EVENTS:{n_events:10d} RATE:{(n_events/(t1-t0))*1e-6:.2f}MHz'
        print(summary, flush=True)
|
Code Block |
---|
language | bash |
---|
title | submit_slac.sh |
---|
|
#!/bin/bash
# Slurm batch script: launches run_slac.sh through srun on 9 nodes with 396
# tasks in total and records timestamps for the elapsed-time report.
#SBATCH --partition=anaq
#SBATCH --job-name=psana2
#SBATCH --nodes=9
#SBATCH --ntasks=396
# The next line starts with "##", so it is an ordinary comment, not an
# active #SBATCH directive.
##SBATCH --ntasks-per-node=50
#SBATCH --output=%j.log
#SBATCH --exclusive

# Job start time in seconds since the epoch.
t_start=$(date +%s)

# NOTE(review): setup_hosts.sh is not shown here; it presumably sets
# SLURM_HOSTFILE, which is echoed below -- confirm.
source setup_hosts.sh
echo "SLURM_HOSTFILE ${SLURM_HOSTFILE} SLURM_NTASKS ${SLURM_NTASKS}"

# test_live.py reads PS_EB_NODES from the environment for its summary line.
export PS_EB_NODES=32

MAX_EVENTS=0
EXP="tstx00817"
RUNNO=55

# Quote the arguments so each one is passed to run_slac.sh as a single word.
srun ./run_slac.sh "$MAX_EVENTS" "$EXP" "$RUNNO"

# Job end time in seconds since the epoch.
t_end=$(date +%s)
echo PSJobCompleted TotalElapsed $((t_end-t_start)) |
Code Block |
---|
|
*** /cds/home/m/monarin/lcls2/install/include/xtcdata/xtc/ShapesData.hh:355: incorrect TypeId 0
[drp-srcf-cmp048:209412] *** Process received signal ***
[drp-srcf-cmp048:209412] Signal: Aborted (6)
[drp-srcf-cmp048:209412] Signal code: (-6) |