# Source code for sherpa.io

#
#  Copyright (C) 2007, 2015, 2016, 2019 - 2021, 2023, 2024
#  Smithsonian Astrophysical Observatory
#
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#

"""I/O routines for Sherpa.

These routines are currently restricted to reading from ASCII files.
"""

import os
from typing import Optional, Sequence

import numpy as np

from sherpa.data import Data, Data1D
from sherpa.utils import is_subclass, get_num_args, is_binary_file
from sherpa.utils.err import IOErr
from sherpa.utils.numeric_types import SherpaFloat
from sherpa.utils.types import ArrayType


__all__ = ('read_data', 'write_data', 'get_ascii_data', 'read_arrays',
           'write_arrays')


NamesType = Sequence[str]


def _check_args(size: int, dstype) -> None:
    """Check that enough columns were supplied for the data class.

    Raises a TypeError when size is smaller than the number of
    required constructor arguments (not counting self and filename).
    """
    # Find the number of required args minus self, filename
    needed = get_num_args(dstype.__init__)[1] - 2
    if size < needed:
        raise TypeError(f"data set '{dstype.__name__}' takes at "
                        f"least {needed} args")


def read_file_data(filename: str,
                   sep: str = ' ',
                   comment: str = '#',
                   require_floats: bool = True
                   ) -> tuple[list[str], list[np.ndarray]]:
    """Read in column data from a file.

    Parameters
    ----------
    filename : str
        The name of the ASCII file to read.
    sep : str, optional
        The column separator. The default is ``' '``.
    comment : str, optional
        The comment character. The last comment line before the data
        provides the column names.
    require_floats : bool, optional
        If `True` (the default), a column that cannot be converted to
        a floating-point type raises a `ValueError`.

    Returns
    -------
    (names, coldata)
        The column names and the column data (one array per column).

    Raises
    ------
    sherpa.utils.err.IOErr
        If the rows do not all contain the same number of columns or
        no data rows were found.
    ValueError
        If a column value can not be converted into a numeric value
        and `require_floats` is `True`.
    """

    bad_chars = '\t\n\r,;: |'
    raw_names = []
    rows = []

    ncols = None
    with open(filename, 'r', encoding="utf-8") as fh:
        for line in fh:
            for char in bad_chars:
                if char in line:
                    # replace any bad chars in line with sep for tokenize
                    line = line.replace(char, sep)

            line = line.strip()

            # look for last commented line before data
            if len(line) > 0 and line[0] == comment:
                # Slice off the comment
                # TODO: why is this not just `line = line[1:]`?
                line = line.replace(comment, ' ')
                raw_names = line.strip().split(sep)

            elif line == '':
                continue
            else:
                # split line at sep
                elems = line.strip().split(sep)
                row = [elem for elem in elems if elem != '']

                # make list of row elements
                rows.append(row)

                if ncols is None:
                    ncols = len(row)
                elif ncols != len(row):
                    raise IOErr('arraysnoteq')

    if ncols is None:
        # Fix: report the file that was actually read (the message
        # previously contained a literal placeholder instead of the
        # file name).
        raise IOErr(f"No column data found in {filename}")

    # rotate rows into list of columns
    cols = np.column_stack(rows)

    # cast columns to appropriate type
    args = []
    for col in cols:
        try:
            args.append(col.astype(SherpaFloat))
        except ValueError as ve:
            if require_floats:
                # Fix: include the file name in the error message.
                raise ValueError(f"The file {filename} could not "
                                 "be loaded, probably because it "
                                 "contained spurious data and/or "
                                 "strings") from ve
            args.append(col)

    names = [name.strip(bad_chars)
             for name in raw_names if name != '']
    nargs = len(args)
    # TODO: should this error out if nargs == 0?

    if len(names) == 0:
        names = [f'col{i}' for i in range(1, nargs + 1)]

    # TODO: This could error out if len(names) > nargs, but this might
    # break existing code, since there is such a check in
    # get_ascii_data but it's only triggered when the colkeys argument
    # is set. To avoid breaking code we leave as is for now.
    #
    return names, args


def get_column_data(*args) -> list[Optional[np.ndarray]]:
    """
    get_column_data( *NumPy_args )

    Convert each argument into one or more column arrays: a `None`
    entry passes through unchanged, a 1D sequence becomes a single
    column, and a 2D sequence contributes one entry per column.
    """
    if not args:
        raise IOErr('noarrays')

    out: list[Optional[np.ndarray]] = []
    for arg in args:
        if arg is None:
            out.append(None)
            continue

        if not isinstance(arg, (np.ndarray, list, tuple)):
            raise IOErr('badarray', arg)

        # Transpose so that each row of the 2D view is a column of
        # the original input.
        data = np.asanyarray(arg)
        out.extend(np.atleast_2d(data.T))

    return out


def get_ascii_data(filename: str,
                   ncols: int = 1,
                   colkeys: Optional[NamesType] = None,
                   sep: str = ' ',
                   dstype: type = Data1D,
                   comment: str = '#',
                   require_floats: bool = True
                   ) -> tuple[list[str], list[np.ndarray], str]:
    r"""Read in columns from an ASCII file.

    Parameters
    ----------
    filename : str
        The name of the ASCII file to read in.
    ncols : int, optional
        The number of columns to read in (the first ``ncols`` columns
        in the file). This is ignored if ``colkeys`` is given.
    colkeys : array of str, optional
        An array of the column names to read in (matched
        case-sensitively). The default is `None`.
    sep : str, optional
        The separator character. The default is ``' '``.
    dstype : data class to use, optional
        Used to check that the data file contains enough columns.
    comment : str, optional
        The comment character. The default is ``'#'``.
    require_floats : bool, optional
        If `True` (the default), non-numeric data values will raise
        a `ValueError`.

    Returns
    -------
    (colnames, coldata, filename)
        The column names read in, the data for the columns (the order
        matches ``colnames``), and the name of the file.

    Raises
    ------
    sherpa.utils.err.IOErr
        Raised if a requested column is missing or the file appears
        to be a binary file.
    ValueError
        If a column value can not be converted into a numeric value
        and the `require_floats` parameter is `True`.

    See Also
    --------
    read_arrays, read_data, write_arrays, write_data

    Notes
    -----
    Each line is read in, unsupported characters (``\t``, ``\n``,
    ``\r``, comma, semi-colon, colon, space, and ``|``) are replaced
    by the ``sep`` argument, empty lines are skipped, and the last
    comment line before the data defines the column names. When there
    is no such line the columns are named ``col1``, ``col2``, and so
    on. An error is raised if the number of columns per row is not
    constant.

    Examples
    --------
    Read in the first column from the file:

    >>> (colnames, coldata, fname) = get_ascii_data('src.dat')

    Read in a histogram data set, using the columns XLO, XHI, and Y:

    >>> cols = ['XLO', 'XHI', 'Y']
    >>> res = get_ascii_data('hist.dat', colkeys=cols,
    ...                      dstype=sherpa.data.Data1DInt)

    Read in the first and third column from the file cols.dat, where
    the file has no header information:

    >>> res = get_ascii_data('cols.dat', colkeys=['col1', 'col3'])

    """
    if is_binary_file(filename):
        raise IOErr('notascii', filename)

    names, args = read_file_data(filename, sep=sep, comment=comment,
                                 require_floats=require_floats)

    # Without colkeys, just return the leading ncols columns.
    if colkeys is None:
        kwargs = []
        if ncols != 1:
            _check_args(ncols, dstype)
        kwargs.extend(args[:ncols])
        return (names, kwargs, filename)

    kwargs = []
    colkeys = list(colkeys)
    nnames = len(names)
    nargs = len(args)
    if nnames > nargs:
        raise IOErr('wrongnumcols', nargs, nnames)

    # Select the requested columns, preserving the caller's ordering.
    for key in colkeys:
        if key not in names:
            raise IOErr('reqcol', key, names)
        kwargs.append(args[names.index(key)])

    _check_args(len(kwargs), dstype)
    return (colkeys, kwargs, filename)
def read_data(filename: str,
              ncols: int = 2,
              colkeys: Optional[NamesType] = None,
              sep: str = ' ',
              dstype=Data1D,
              comment: str = '#',
              require_floats: bool = True) -> Data:
    """Create a data object from an ASCII file.

    Parameters
    ----------
    filename : str
        The name of the ASCII file to read in.
    ncols : int, optional
        The number of columns to read in (the first ``ncols`` columns
        in the file). This is ignored if `colkeys` is given.
    colkeys : array of str, optional
        An array of the column names to read in. The default is `None`.
    sep : str, optional
        The separator character. The default is ``' '``.
    dstype : data class to use, optional
        The class of the data object to create.
    comment : str, optional
        The comment character. The default is ``'#'``.
    require_floats : bool, optional
        If `True` (the default), non-numeric data values will raise
        a `ValueError`.

    Returns
    -------
    data
        The data object, created by calling the ``dstype``
        constructor with the filename and the columns read from the
        file.

    Raises
    ------
    sherpa.utils.err.IOErr
        Raised if a requested column is missing or the file appears
        to be a binary file.
    ValueError
        If a column value can not be converted into a numeric value
        and the `require_floats` parameter is True.

    See Also
    --------
    get_ascii_data, read_arrays, write_data

    Notes
    -----
    The file format is described in `get_ascii_data`.

    Examples
    --------
    Create a 1D data object from the first two columns in the file:

    >>> dat = read_data('src.dat')

    Use the third column as the error column (statistical):

    >>> dat = read_data('src.dat', ncols=3)

    Read in a histogram data set, using the columns XLO, XHI, and Y:

    >>> cols = ['XLO', 'XHI', 'Y']
    >>> dat = read_data('hist.dat', colkeys=cols,
    ...                 dstype=sherpa.data.Data1DInt)

    """
    _, args, name = get_ascii_data(filename, ncols=ncols,
                                   colkeys=colkeys, sep=sep,
                                   dstype=dstype, comment=comment,
                                   require_floats=require_floats)
    return dstype(name, *args)
def read_arrays(*args) -> Data:
    """Create a data object from arrays.

    Parameters
    ----------
    col1, ... coln : array_like
        The data columns.
    dstype : optional
        The data type to create. It must be a subclass of
        `sherpa.data.Data` and defaults to `sherpa.data.Data1D`.

    Returns
    -------
    data
        The data object, created with an empty name and the supplied
        columns.

    Raises
    ------
    sherpa.utils.err.IOErr
        Raised if no arrays are sent in.

    See Also
    --------
    get_ascii_data, write_arrays

    Examples
    --------
    Create a 1D data object from the x and y arrays:

    >>> dat = read_arrays(x, y)

    Include a statistical error column:

    >>> dat = read_arrays(x, y, dy)

    Create an integrated (i.e. histogram) data set:

    >>> dat = read_arrays(xlo, xhi, y, dstype=sherpa.data.Data1DInt)

    """
    arglist = list(args)
    if len(arglist) == 0:
        raise IOErr('noarrays')

    # An optional trailing argument selects the data class.
    if is_subclass(arglist[-1], Data):
        dstype = arglist.pop()
    else:
        dstype = Data1D

    dargs = get_column_data(*arglist)

    # Determine max number of args for dataset constructor
    _check_args(len(dargs), dstype)
    return dstype('', *dargs)
def write_arrays(filename: str,
                 args: Sequence[ArrayType],
                 fields: Optional[NamesType] = None,
                 sep: str = ' ',
                 comment: str = '#',
                 clobber: bool = False,
                 linebreak: str = '\n',
                 format: str = '%g') -> None:
    """Write a list of arrays to an ASCII file.

    Parameters
    ----------
    filename : str
        The name of the file to write the array to.
    args : array_like
        The arrays to write out.
    fields : array_like of str
        The column names (should match the size of `args` if given).
    sep : str, optional
        The separator character. The default is ``' '``.
    comment : str, optional
        The comment character. The default is ``'#'``. This is only
        used to write out the column names when `fields` is not
        `None`.
    clobber : bool, optional
        This flag controls whether an existing file can be
        overwritten (``True``) or if it raises an exception
        (``False``, the default setting).
    linebreak : str, optional
        Indicate a new line. The default is ``'\\n'``.
    format : str, optional
        The format used to write out the numeric values. The default
        is ``'%g'``.

    Raises
    ------
    sherpa.utils.err.IOErr
        If `filename` already exists and `clobber` is `False` or if
        there is no data to write.

    See Also
    --------
    get_ascii_data

    Examples
    --------
    Write the x and y arrays to the file 'src.dat':

    >>> write_arrays('src.dat', [x, y])

    Use the column names "r" and "surbri" for the columns:

    >>> write_arrays('prof.txt', [x, y], fields=["r", "surbri"],
    ...              clobber=True)

    """
    if os.path.isfile(filename) and not clobber:
        raise IOErr("filefound", filename)

    # We assume the values are numeric but we never test for this
    # explicitly, nor do we require it in the types. Note that since
    # numpy 1.24 it is an error to pass irregularly-gridded data to
    # asarray.
    #
    # Collect the lengths of the elements; a scalar counts as 0.
    sizes = set()
    try:
        for arg in args:
            try:
                sizes.add(len(arg))
            except TypeError:
                # len(arg) fails, so assume a scalar.
                sizes.add(0)
    except TypeError:
        # args is not iterable, in which case sizes will be empty and
        # caught below
        pass

    # Allow args to be a sequence of non-sequences or of sequences of
    # the same size. The former is technically not in the spirit of
    # the call but users may be taking advantage of it so do not error
    # out.
    #
    if len(sizes) == 0 or 0 in sizes:
        raise IOErr('noarrayswrite')
    if len(sizes) != 1:
        raise IOErr('arraysnoteq')

    table = np.column_stack(np.asarray(args))
    if table.ndim < 2:
        raise IOErr('noarrayswrite')

    # Each row of the stacked table becomes one output line.
    rows = [sep.join(format % value for value in row)
            for row in table]

    with open(filename, 'w', encoding="utf-8") as fh:
        if fields is not None:
            fh.write(comment + sep.join(fields) + linebreak)
        fh.write(linebreak.join(rows))
        # add a newline at end
        fh.write(linebreak)
def write_data(filename: str,
               dataset: Data,
               fields: Optional[NamesType] = None,
               sep: str = ' ',
               comment: str = '#',
               clobber: bool = False,
               linebreak: str = '\n',
               format: str = '%g'
               ) -> None:
    """Write out a dataset as an ASCII file.

    Parameters
    ----------
    filename : str
        The name of the file to write the array to.
    dataset :
        The data object to write out.
    fields : array_like of str
        The column names (should match the size of ``args`` if
        given). Any unknown columns are skipped. If not given then
        the field names from the data set will be used (for those
        columns which contain data).
    sep : str, optional
        The separator character. The default is ``' '``.
    comment : str, optional
        The comment character. The default is ``'#'``. This is used
        to write out the column names (after converting to upper
        case) before the data.
    clobber : bool, optional
        This flag controls whether an existing file can be
        overwritten (`True`) or if it raises an exception (`False`,
        the default setting).
    linebreak : str, optional
        Indicate a new line. The default is ``'\\n'``.
    format : str, optional
        The format used to write out the numeric values. The default
        is ``'%g'``.

    Raises
    ------
    sherpa.utils.err.IOErr
        If `filename` already exists and `clobber` is `False` or if
        there is no data to write.

    See Also
    --------
    get_ascii_data, read_data

    Examples
    --------
    Write the data set to the file 'src.dat':

    >>> write_data('src.dat', dat)

    """
    if fields is None:
        fields = dataset._fields

    # Keep only the fields that contain data, skipping the name
    # field; column headers use upper-case names.
    cols = []
    col_names = []
    for field_name in fields:
        value = getattr(dataset, field_name, None)
        if value is None or field_name == 'name':
            continue
        col_names.append(field_name.upper())
        cols.append(value)

    write_arrays(filename, cols, fields=col_names, sep=sep,
                 comment=comment, clobber=clobber,
                 linebreak=linebreak, format=format)