Source code for pwkit.tabfile

# -*- mode: python; coding: utf-8 -*-
# Copyright 2013-2014 Peter Williams <peter@newton.cx> and collaborators.
# Licensed under the MIT License.

"""pwkit.tabfile - I/O with typed tables of uncertain measurements.

Functions:

read    - Read a typed table file.
vizread - Read a headerless table file, with columns specified separately.
write   - Write a typed table file.

The table format is line-oriented text. Hashes denote comments. Initial lines
of the form "colname = value" set a column name that gets the same value for
every item in the table. The header line is prefixed with an @ sign.
Subsequent lines are data rows.

"""
from __future__ import absolute_import, division, print_function, unicode_literals

# This module enables `unicode_literals`, so each exported name is passed
# through str() to keep the entries of __all__ native strings.
__all__ = [str("read"), str("vizread"), str("write")]

from . import Holder, PKError, io, msmt, reraise_context


def _getparser(lname):
    """Split a "name[:kind]" column label into a name and a value parser.

    The kind is whatever follows the last colon in `lname`; a label without
    any colon uses the "s" kind. Returns a ``(name, parser)`` tuple in which
    the parser is the `msmt.parsers` entry for that kind. A kind with no entry
    fails in that lookup.

    """
    name, colon, kind = lname.rpartition(":")
    if not colon:
        # No kind suffix at all: the whole label is the column name.
        name, kind = lname, "s"
    return name, msmt.parsers[kind]


def _trimmedlines(path, **kwargs):
    """Yield the meaningful lines of the text file at `path`.

    Each line obtained from `io.pathlines(path, **kwargs)` has its trailing
    newline removed and everything from the first "#" onward (a comment)
    discarded. Lines that end up empty or all-whitespace are skipped. Leading
    and interior whitespace is preserved, because callers locate columns by
    character position.

    """
    for line in io.pathlines(path, **kwargs):
        # Drop only an actual trailing newline: the final line of a file need
        # not have one, and blindly chopping the last character would eat a
        # character of real data in that case.
        if line.endswith("\n"):
            line = line[:-1]

        line = line.split("#", 1)[0]  # strip comments

        if not line or line.isspace():
            continue
        yield line



[docs]
def read(path, tabwidth=8, **kwargs):
    """Read a typed tabular text file into a stream of Holders.

    Arguments:

    path
      The path of the file to read.
    tabwidth=8
      The tab width to assume. Please don't monkey with it.
    mode='rt'
      The file open mode (passed to io.open()).
    noexistok=False
      If True and the file is missing, treat it as empty.
    ``**kwargs``
      Passed to io.open ().

    Returns a generator for a stream of `pwkit.Holder`s, each of which will
    contain ints, strings, or some kind of measurement (cf `pwkit.msmt`).

    Lines before the "@" header line have the form "name[:kind] = value"; the
    parsed value is attached to every row. The header line fixes each column's
    name, kind and character span (measured after tab expansion), and the
    cells of every following row are cut out at those same spans, so data rows
    must be aligned with the header. Header fields whose kind maps to a parser
    of None are not reported. The last reported column extends to the end of
    each line unless the final header field was such an ignored one.

    If a cell cannot be parsed, the exception is passed on via
    `reraise_context` together with the offending cell text.

    """
    datamode = False
    fixedcols = {}

    for text in _trimmedlines(path, **kwargs):
        text = text.expandtabs(tabwidth)

        if datamode:
            # table row
            h = Holder()
            h.set(**fixedcols)
            for name, cslice, parser in info:
                cell = text[cslice].strip()
                try:
                    v = parser(cell)
                except Exception:
                    # Only ordinary errors get annotated; KeyboardInterrupt
                    # and SystemExit propagate untouched.
                    reraise_context('while parsing "%s"', cell)
                h.set_one(name, v)
            yield h
        elif text[0] != "@":
            # fixed column
            padnamekind, padval = text.split("=", 1)
            name, parser = _getparser(padnamekind.strip())
            fixedcols[name] = parser(padval.strip())
        else:
            # column specification
            n = len(text)
            assert n > 1  # need something after the "@"
            start = 0
            info = []

            while start < n:
                # Scan to the end of this header field ...
                end = start + 1
                while end < n and (not text[end].isspace()):
                    end += 1

                if start == 0:
                    namekind = text[start + 1 : end]  # eat leading @
                else:
                    namekind = text[start:end]

                # ... then across the whitespace that follows it, so that the
                # column's span reaches the start of the next field.
                while end < n and text[end].isspace():
                    end += 1

                name, parser = _getparser(namekind)
                if parser is None:  # allow columns to be ignored
                    skippedlast = True
                else:
                    skippedlast = False
                    info.append((name, slice(start, end), parser))
                start = end

            datamode = True

            if not skippedlast:
                # make our last column go as long as the line goes
                # (e.g. for "comments" columns)
                # but if the real last column is ":x"-type, then info[-1]
                # doesn't run up to the end of the line, so do nothing in that case.
                lname, lslice, lparser = info[-1]
                info[-1] = lname, slice(lslice.start, None), lparser



def _tabpad(text, width, tabwidth=8):
    # note: assumes we're starting tab-aligned
    l = len(text)
    assert l <= width

    if l == width:
        return text

    n = width - l
    ntab = n // tabwidth
    nsp = n - ntab * tabwidth
    return "".join((text, " " * nsp, "\t" * ntab))



[docs]
def write(stream, items, fieldnames, tabwidth=8):
    """Write a typed tabular text file to the specified stream.

    Arguments:

    stream
      The destination stream.
    items
      An iterable of items to write. Two passes have to
      be made over the items (to discover the needed column widths),
      so this will be saved into a list. Each item is queried with
      ``item.get(fieldname)``; a result of None is written as an empty cell.
    fieldnames
      Either a list of field name strings, or a single string.
      If the latter, it will be split into a list with .split().
      At least one field name is required.
    tabwidth=8
      The tab width to use. Please don't monkey with it.

    Returns nothing.

    Each column header is written as "name:type", where the type tag comes
    from `msmt.fmtinfo` applied to the column's values. Columns with an empty
    type tag, and columns in which no item has a value, are written with the
    bare name. Raises PKError if the values in one column have type tags that
    cannot be reconciled.

    """
    if isinstance(fieldnames, str):
        fieldnames = fieldnames.split()

    ncols = len(fieldnames)
    maxlens = [0] * ncols

    # We have to make two passes, so listify:
    items = list(items)

    # pass 1: get types and maximum lengths for each record. Pad by 1 to
    # ensure there's at least one space between all columns.

    coltypes = [None] * ncols

    for i in items:
        for idx, fn in enumerate(fieldnames):
            val = i.get(fn)
            if val is None:
                continue

            typetag, text, _ = msmt.fmtinfo(val)
            maxlens[idx] = max(maxlens[idx], len(text) + 1)

            current = coltypes[idx]
            if current is None:
                coltypes[idx] = typetag
                continue

            if current == typetag:
                continue

            # Type tags may be the empty string, so test the final character
            # with endswith() rather than indexing with [-1].
            if current[:-1] == typetag[:-1]:
                if current.endswith("f") and typetag.endswith("u"):
                    # Can upcast floats to uvals
                    coltypes[idx] = current[:-1] + "u"
                    continue

                if current.endswith("u") and typetag.endswith("f"):
                    # The column already has the wider of the two types.
                    continue

            raise PKError(
                "irreconcilable column types: %s and %s", coltypes[idx], typetag
            )

    # Compute column headers and their widths

    headers = list(fieldnames)
    headers[0] = "@" + headers[0]
    lastidx = len(headers) - 1

    for idx, typetag in enumerate(coltypes):
        # Neither an empty tag nor a column that never had a value (None)
        # gets a ":type" suffix.
        if typetag:
            headers[idx] += ":" + typetag

        # Every header but the last needs a separator after it; otherwise a
        # header that exactly fills its column would run into the next one.
        hlen = len(headers[idx])
        if idx != lastidx:
            hlen += 1

        maxlens[idx] = max(maxlens[idx], hlen)

    # Round each column width up to a whole number of tab stops.
    widths = [tabwidth * ((k + tabwidth - 1) // tabwidth) for k in maxlens]

    # pass 2: write out

    print(
        "".join(_tabpad(h, widths[idx], tabwidth) for (idx, h) in enumerate(headers)),
        file=stream,
    )

    def ustr(i, f):
        v = i.get(f)
        if v is None:
            return ""
        return msmt.fmtinfo(v)[1]

    for i in items:
        print(
            "".join(
                _tabpad(ustr(i, fn), widths[idx], tabwidth)
                for (idx, fn) in enumerate(fieldnames)
            ),
            file=stream,
        )




[docs]
def vizread(descpath, descsection, tabpath, tabwidth=8, **kwargs):
    """Read a headerless tabular text file into a stream of Holders.

    Arguments:

    descpath
      The path of the table description ini file.
    descsection
      The section in the description file to use.
    tabpath
      The path to the actual table data.
    tabwidth=8
      The tab width to assume. Please don't monkey with it.
    mode='rt'
      The table file open mode (passed to io.open()).
    noexistok=False
      If True and the file is missing, treat it as empty.
    ``**kwargs``
      Passed to io.open ().

    Returns a generator of a stream of `pwkit.Holder`s, each of which will
    contain ints, strings, or some kind of measurement (cf `pwkit.msmt`). In
    this version, the table file does not contain a header, as seen in Vizier
    data files. The corresponding section in the description ini file has keys
    of the form "colname = <start> <end> [type]", where <start> and <end> are
    the **1-based** character numbers defining the column, and [type] is an
    optional specifier of the measurement type of the column (one of the usual
    b, i, f, u, Lu, Pu). If <end> is omitted the column is the single
    character at <start>; if [type] is omitted the column is read as a string.

    If a cell cannot be parsed, the exception is passed on via
    `reraise_context` together with the offending cell text.

    """
    from .inifile import read as iniread

    cols = []

    for i in iniread(descpath):
        if i.section != descsection:
            continue

        for field, desc in i.__dict__.items():
            if field == "section":
                continue

            a = desc.split()

            # 1-based inclusive character numbers -> 0-based half-open slice.
            idx0 = int(a[0]) - 1
            end = int(a[1]) if len(a) > 1 else idx0 + 1
            kind = a[2] if len(a) > 2 else "s"

            cols.append((field, slice(idx0, end), msmt.parsers[kind]))

    for text in _trimmedlines(tabpath, **kwargs):
        text = text.expandtabs(tabwidth)

        h = Holder()
        for name, cslice, parser in cols:
            cell = text[cslice].strip()
            try:
                v = parser(cell)
            except Exception:
                # Only ordinary errors get annotated; KeyboardInterrupt and
                # SystemExit propagate untouched.
                reraise_context('while parsing "%s"', cell)
            h.set_one(name, v)

        yield h