Page History

...

Code Block

theme	Confluence
language	python
title	gather_savegatherSave.py
linenumbers	true
collapse	true

import sys
import numpy as np
import h5py
from collections import defaultdict
import psana


def getFirstEpicsValue(epics,pvName):
    '''convenience function to get first epics value from the given pv
    '''
    return epics.getPV(pvName).data()[0]

def gatherData(eventsToProcessmaxEventsToProcess=None):
    '''Goes through a specific experiment and run, 
    gathers specific values from the data,
    returns a two numpy arrays:
       compoundDatadataArray: the gathered data
       compoundTimetimeArray: the event times for each row of the gathered data
                  has two fields - 'seconds' 'nanoseconds', both integers
    ds'''
    dataSource = psana.DataSource("exp=cxitut13:run=0022")
    src_diodesrcDiode = psana.Source('DetInfo(CxiDg4.0:Ipimb.0)')
    src_gasdetsrcGasDet = psana.Source('BldInfo(FEEGasDetEnergy)')
    epicsepicsStore = dsdataSource.env().epicsStore()

    dataLists# = defaultdict(list)
    for num,evt in enumerate(ds.events()):define data type for data array and time array
    dataDtype = np.dtype([('aG',np.float),  if eventsToProcess is not None and num >= eventsToProcess:
('aD',np.float), ('aH',np.float),  
                break
        diode = evt.get(psana.Ipimb.DataV2,src_diode) ('aX',np.float), ('aY',np.float), ('aZ',np.float)])

    timeDtype    dataLists['aDiode'].append(diode.channel0Volts()+diode.channel1Volts()+= np.dtype([('seconds',np.uint32),('nanoseconds',np.uint32)])

    # to minimize realloc of arrays, set the number of elements in a  block size
    blockSize = 100000
    dataArray = np.zeros(blockSize, dtype=dataDtype)
    timeArray = diodenp.channel2Volts()+diode.channel3Volts())
zeros(blockSize, dtype=timeDtype)

    # motor calibration
   GasDet EncoderScale = evt.get(psana.Bld.BldDataFEEGasDetEnergy,src_gasdet)
  0.005 
    Xoffset = 17050.
    Yoffset = 1056510.
    Zoffset  dataLists['aGasDet'].append(GasDet.f_11_ENRC())= 1830400.
    
    eventIdnextArrayRow = evt.get(psana.EventId)0
    for eventNumber, evt  dataLists['aTime'].append(eventId.timein enumerate(dataSource.events()):
        if maxEventsToProcess is not None and nextArrayRow  dataLists['aSampleX'].append(getFirstEpicsValue(epics,'CXI:SC1:MZM:08:ENCPOSITIONGET'))>= maxEventsToProcess:
        dataLists['aSampleY'].append(getFirstEpicsValue(epics,'CXI:SC1:MZM:09:ENCPOSITIONGET'))
    break
        diode  dataLists['aSampleZ'].append(getFirstEpicsValue(epics,'CXI:SC1:MZM:10:ENCPOSITIONGET')= evt.get(psana.Ipimb.DataV2,srcDiode)
        dataLists['aKB1Hpitch'].append(getFirstEpicsValue(epics,'CXI:KB1:MMS:07.RBV'))gasDet = evt.get(psana.Bld.BldDataFEEGasDetEnergy,srcGasDet)
        if# check numthat %all 1000data == 0:
      is present in event before adding to dataArray
      print "event number=%8dif diode.channel0Volts=%f GasDet.f_11_ENCR=%f" % \
 is None or gasDet is None:
            continue
        (num, diode.channel0Volts(),GasDet.f_11_ENRC())

    # motor calibration
    EncoderScale = 0.005 
    Xoffset = 17050.# grow arrays if they are to small
        if nextArrayRow >= timeArray.size:
            timeArray.resize(timeArray.size + blockSize)
    Yoffset = 1056510.
    Zoffset = 1830400.

dataArray.resize(dataArray.size + blockSize)
        compoundDataTypetimeArray[nextArrayRow] = npevt.dtype([('aG',np.float), ('aD',np.float), ('aH',np.float),  get(psana.EventId).time()
        diodeSum = diode.channel0Volts() + diode.channel1Volts() + \
                   ('aX',np.float), ('aY',np.float), ('aZ',np.float)])

diode.channel2Volts() + diode.channel3Volts()
      compoundData = np.zeros(len(dataLists dataArray[nextArrayRow]['aTimeaD']), dtype=compoundDataType)

 diodeSum
        compoundDatadataArray[nextArrayRow]['aG'] = dataLists['aGasDet']
gasDet.f_11_ENRC()
        sampleX = epicsStore.getPV('CXI:SC1:MZM:08:ENCPOSITIONGET').value(0)
        compoundData['aD']sampleY = dataLists['aDiode']
epicsStore.getPV('CXI:SC1:MZM:09:ENCPOSITIONGET').value(0)
       compoundData['aH'] sampleZ = dataLists['aKB1Hpitch']epicsStore.getPV('CXI:SC1:MZM:10:ENCPOSITIONGET').value(0)
    compoundData    dataArray[nextArrayRow]['aX'] = np.array(dataLists['aSampleX'])sampleX * EncoderScale - Xoffset
    compoundData    dataArray[nextArrayRow]['aY'] = np.array(dataLists['aSampleY'])sampleY * EncoderScale - Yoffset
     compoundData   dataArray[nextArrayRow]['aZ'] = np.array(dataLists['aSampleZ'])sampleZ * EncoderScale - Zoffset

      timeDataType = np.dtype([('seconds',np.uint32),('nanoseconds',np.uint32)])
    compoundTime = np.zeros(len(dataLists['aTime']), dtype=timeDataType)
    compoundTime[:] = dataLists['aTime'] dataArray[nextArrayRow]['aH'] = \
                             epicsStore.getPV('CXI:KB1:MMS:07.RBV').value(0)
        if nextArrayRow % 1000 == 0:
            print "event %8d diode.channel0Volts=%f gasDet.f_11_ENCR=%f" % \
                (eventNumber, diode.channel0Volts(),gasDet.f_11_ENRC())
        nextArrayRow += 1

    # shrink arrays to number of events we stored data from
    timeArray.resize(nextArrayRow)
    dataArray.resize(nextArrayRow)

    return compoundDatadataArray, compoundTimetimeArray

def writeCompoundDataToH5H5WriteDataAndTime(compoundDatadataArray, compoundTimetimeArray, h5filename):
    f = h5py.File(h5filename,'w')
    data_comp_typedataDtype = compoundDatadataArray.dtype
    data_dsetdataDset = f.create_dataset('data', compoundDatadataArray.shape, data_comp_typedataDtype)
    data_dsetdataDset[:] = compoundDatadataArray
    time_comp_typetimeDtype = compoundTimetimeArray.dtype
    time_dsettimeDset = f.create_dataset('time',compoundTimetimeArray.shape, time_comp_typetimeDtype)
    time_dsettimeDset[:] = compoundTimetimeArray
    f.close()

def readCompoundDataFromH5H5ReadDataAndTime(h5filename):
    f = h5py.File(h5filename)
    data_dsetdataDset = f['data']
    data_arraydataArray = data_dsetdataDset[:]
    time_dsettimeDset = f['time']
    time_arraytimeArray = time_dsettimeDset[:]
    f.close()
    return data_arraydataArray, time_arraytimeArray

if __name__ == '__main__':
    maxArraySize = None
    if len(sys.argv) > 1:
        maxArraySize = int(sys.argv[1])
    datadataArray,timePairtimeArray = gatherData(maxArraySize)
    writeCompoundDataToH5H5WriteDataAndTime(datadataArray,timePair timeArray, "saved_output.h5")
    datadataArray,timePairtimeArray = readCompoundDataFromH5H5ReadDataAndTime("saved_output.h5")

To use this script:

place it in your release directory
run ipython

enter the commands:

import gather_savegatherSave

datadataArray,timePairtimeArray = gather_savegatherSave.gatherData()

gather_savegatherSave.writeCompoundDataToH5H5WriteDataAndTime(datadataArray,timePair timeArray, "saved_output.h5")

datadataArray,timePairtimeArray = gather_save.readCompoundDataFromH5H5ReadDataAndTime("saved_output.h5")

The function gatherData() is one that needs to be modified for different datasets. writeCompoundData H5WriteDataAndTime and readCompoundData H5ReadDataAndTime will not.

data dataArray is a numpy array with 6 named fields that gather different values from the events, epics pv's , a value from the gas detector, and the voltage sum of a Diode. The fields have names like 'aD' (the Diode sum) and 'aG' for the gas detector value.

...

Code Block

language	python

logicalIndex = datadataArray['aG'] > 0.84   # a mask that is 1 when 'aG' is greater than 0.84
data['aD'][logicalIndex]                # the mask is used to get diode values when 'aG' is > than 0.84
logicalIndex.nonzero()[0]               # turn the mask into a list of positions, see the documentation on 
                                        # the numpy function nonzero 
                  # http://docs.scipy.org/doc/numpy/reference/generated/numpy.nonzero.html

# likewise one can do
import numpy as np
np.where(data['aG'] > 0.84)[0]     # the [0] is necessary to get the indicies along the first axis

...

Below we discuss how things are done in the example.

...

Creation of Numpy Array with Named Fields

Numpy arrays are very efficient data structures in Python. This example creates two numpy arrays to store the event data. These arrays have named fields which provides a dictionary style access to the data. Note the two numpy dtypes (data types) created that define these arrays:

Code Block

language	python

from collections import defaultdict
...
    dataLists = defaultdict(list)
    for num,evt in enumerate(ds.events()):
    ...    # define data type for data array and time array
    dataDtype = np.dtype([('aG',np.float), ('aD',np.float), ('aH',np.float),  
                          ('aX',np.float), ('aY',np.float), ('aZ',np.float)])
    timeDtype    dataLists['aTime'].append(eventId.time())

The use of the standard Python defaultdict allows us to avoid initializing the keys in dataList. The price is that if we later make a typo when we retrieve 'aTime', from dataLists, the error message will not make this clear.

Creation of Numpy Array with Named Fields

= np.dtype([('seconds',np.uint32),('nanoseconds',np.uint32)])

A 32 bit unsigned int is used for the seconds and nanoseconds for the timeArray as this is how these fields are stored in the xtc, but one could use np.int or np.uint as well.

For more information on numpy dtypes visit the documentation: Numpy Dtypes

Checking that all Data is Present in the Event

One needs to check that the data one wants from an event is present. In the example there are two counters - eventNumber and nextArrayRow. eventCounter keeps track of which event we are reading and is only used for printing a status message. nextArrayRow is a zero-up counter of events that include both the diode and gasDet data. If both are not present - we go on to the next event and do not increment nextArrayRow.

Using Array Blocks to Read in Data

Manipulations using numpy arrays are most efficient when the final size of the array is known ahead of time. Although numpy arrays have an append method, using it for each event can When creating numpy arrays, it is more efficient to create with a known size. You can append to an existing numpy array, but to do this with every event may lead to a great deal of memory reallocation. In this example, we read the data into Python lists. Once we have all the data, we create the numpy array of the known size.To create a numpy array with named fields you must define a dtype. For this example where each field is a float, it is fairly straightforward's of the arrays. Therefore we start with arrays that hold 100,000 elements, and grow them by 100,000 if we need to. In the end we shrink the arrays down to the size that is used. Note the lines:

Code Block

language	python

import numpy as np
... 
compoundDataType    # to minimize realloc of arrays, set the number of elements in a  block size
    blockSize = 100000
    dataArray = np.dtype([('aG',np.float), ('aD',np.float), ('aH',np.float),zeros(blockSize, dtype=dataDtype)
    timeArray = np.zeros(blockSize, dtype=timeDtype)
    ...
        # grow arrays if they are to small
        if nextArrayRow >= timeArray.size:
      ('aX',np.float), ('aY',np.float), ('aZ',np.float)])
 
compoundData = np.zeros(len(dataLists['aTime']), dtype=compoundDataType)       timeArray.resize(timeArray.size + blockSize)
            dataArray.resize(dataArray.size + blockSize)
    ...
    # shrink arrays to number of events we stored data from
    timeArray.resize(nextArrayRow)
    dataArray.resize(nextArrayRow)

numpy dtypes can get quite complicated, for more information visit the documentation: http://docs.scipy.org/doc/numpy/reference/arrays.dtypes.htmlWri

Writing an HDF5 File of Compound Data

...

Page tree

Versions Compared

Old Version 6

New Version 7

Key

Creation of Numpy Array with Named Fields

Creation of Numpy Array with Named Fields

Checking that all Data is Present in the Event

Using Array Blocks to Read in Data

Writing an HDF5 File of Compound Data