Skip to content

pdr

fastread(fp: Union[str, Path], debug: bool = False, search_paths: Union[Collection[str], str] = (), **kwargs) -> Data

Read a file with PDR, with the assumption that the label is either attached to fp or that fp is itself a detached label file, and ignoring the usual double-check for fp's actual existence in the filesystem. Intended for cases when you want access to a product's metadata very quickly and you know exactly where its label is.

Source code in pdr/__init__.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def fastread(
    fp: Union[str, Path],
    debug: bool = False,
    search_paths: Union[Collection[str], str] = (),
    **kwargs
) -> Data:
    """
    Quick-access variant of `read()`.

    `fp` is handed to `read()` both as the product path and as its label
    path (`label_fn`), so it must either carry an attached label or be a
    detached label file itself. The filesystem existence check is skipped
    (`skip_existence_check=True`). Meant for situations where you know
    exactly where the label is and want the product's metadata as fast as
    possible.
    """
    return read(
        fp,
        debug=debug,
        label_fn=fp,
        search_paths=search_paths,
        skip_existence_check=True,
        **kwargs
    )

read(fp: Union[str, Path], debug: bool = False, label_fn: Optional[Union[Path, str]] = None, search_paths: Union[Collection[str], str] = (), skip_existence_check: bool = False, **kwargs) -> Data

Read a data product with PDR. fp can be any file associated with the product, preferably a detached label file if it exists. Returns a Data object that provides an interface to the data and metadata in all available files associated with the product.

Source code in pdr/__init__.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def read(
    fp: Union[str, Path],
    debug: bool = False,
    label_fn: Optional[Union[Path, str]] = None,
    search_paths: Union[Collection[str], str] = (),
    skip_existence_check: bool = False,
    **kwargs
) -> Data:
    """
    Open a data product with PDR.

    `fp` may be any file that belongs to the product; a detached label file
    is preferred when one exists. All remaining arguments are forwarded
    unchanged to the `Data` constructor. The returned `Data` object provides
    an interface to the data and metadata in every available file associated
    with the product.
    """
    options = {
        "debug": debug,
        "label_fn": label_fn,
        "search_paths": search_paths,
        "skip_existence_check": skip_existence_check,
    }
    return Data(fp, **options, **kwargs)

_scaling

find_special_constants(data: PDRLike, obj: np.ndarray, name: str) -> dict[str, Number]

attempts to find special constants in an ndarray associated with a PDS3 object by referencing the label and "standard" special constant values.

Source code in pdr/_scaling.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def find_special_constants(
    data: PDRLike, obj: np.ndarray, name: str
) -> dict[str, Number]:
    """
    Look for special constants in an ndarray associated with a PDS3 object.

    Two sources are consulted: constants explicitly defined in the object's
    label block, and "standard" implicit constants for the array's dtype
    that actually occur in the array. Returns a dict mapping constant name
    to value.
    """
    # NOTE: doesn't do anything for PDS4 products at present, although this
    #  may not be important; usually pds4_tools handles it.

    block = specialblock(data, name)
    # explicitly-defined special constants; "N/A" entries are skipped
    specials = {}
    for constant_name in PDS3_CONSTANT_NAMES:
        if constant_name not in block.keys():
            continue
        value = block[constant_name]
        if value == "N/A":
            continue
        # sequence-valued entries are reduced to their first element
        if isinstance(value, Sequence):
            value = value[0]
        specials[constant_name] = value
    # ignore uint8 implicit constants (0, 255) for now -- too problematic
    # TODO: maybe add an override
    if obj.dtype.name == "uint8":
        return specials
    # implicit constants appropriate to the sample type
    implicit_possibilities = IMPLICIT_PDS3_CONSTANTS[obj.dtype.name]
    # NaN can't be found with "in" (it is an equality check), so this entry
    # is not meant to be matched by value; it just creates the key with a
    # value that won't conflict later
    if np.any(~np.isfinite(obj.data)):
        specials["INVALIDS"] = np.nan
    found_implicit = {
        label: constant
        for label, constant in implicit_possibilities.items()
        if constant in obj
    }
    return {**specials, **found_implicit}

fit_to_scale(arr: np.ndarray, scale: Union[Integral, Real], offset: Union[Integral, Real]) -> np.ndarray

Return a version of arr cast to the minimum dtype that will hold its range of values after multiplying by scale and adding offset.

Supports:

float32, float64, uint8, int8, uint16, int16, uint32, int32, uint64, int64.

Source code in pdr/_scaling.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def fit_to_scale(
    arr: np.ndarray,
    scale: Union[Integral, Real],
    offset: Union[Integral, Real]
) -> np.ndarray:
    """
    Return a version of `arr` cast to the minimum dtype that will hold its
    range of values after multiplying by `scale` and adding `offset`.

    Supports:

    float32, float64, uint8, int8, uint16, int16, uint32, int32, uint64, int64.

    Float dtypes are considered when `arr` is already floating-point or when
    either `scale` or `offset` has a fractional part; otherwise integer
    dtypes are tried, unsigned before signed, narrowest first.

    Raises TypeError if `arr` has an unsupported dtype or if no candidate
    dtype can hold the scaled range.
    """
    # 'd' (float64) must be accepted here: it is a documented supported type
    # and is explicitly handled by the float branch below.
    if arr.dtype.char not in 'bBhHiIlLqQnNpPfd':
        raise TypeError(f"This function does not support {arr.dtype.name}")
    # check scale and offset separately: their sum can be integral even when
    # each has a fractional part (e.g. 0.5 + 0.5), in which case the scaled
    # values are still floats.
    needs_float = (
        arr.dtype.char in 'fd'
        or int(scale) != scale
        or int(offset) != offset
    )
    if needs_float:
        bases, widths, infofunc = ('f',), (4, 8), np.finfo
    else:
        bases, widths, infofunc = ('u', 'i'), (1, 2, 4, 8), np.iinfo
    # convert the extrema to Python ints so that computing the scaled range
    # is not subject to fixed-width numpy scalar arithmetic
    amin, amax = map(int, (arr.min(), arr.max()))
    smin, smax = amin * scale + offset, amax * scale + offset
    for base, width in product(bases, widths):
        candidate = np.dtype(f'{base}{width}')
        cinfo = infofunc(candidate)
        if smin >= cinfo.min and smax <= cinfo.max:
            return arr.astype(candidate)
    raise TypeError("Unable to find a suitable data type for scaling.")

mask_specials(obj, specials)

Source code in pdr/_scaling.py
55
56
57
58
59
60
61
62
63
64
def mask_specials(obj, specials):
    """
    Return `obj` as a masked array with special-constant values masked.

    Every element of `obj` equal to a member of `specials` is masked. If
    `specials` contains a NaN, all non-finite elements (NaN and +/-inf) are
    masked as well, because NaN cannot be matched by equality.
    """

    def is_nan(value):
        # np.isnan rejects non-numeric values (e.g. strings); those are
        # simply not NaN for our purposes
        try:
            return bool(np.isnan(value))
        except (TypeError, ValueError):
            return False

    obj = np.ma.masked_array(obj)
    matches = np.isin(obj.data, specials)
    # a plain `np.nan in specials` test only succeeds for the np.nan object
    # itself (identity); check each member so any NaN value is recognized
    if any(is_nan(constant) for constant in specials):
        # masks infs and nans as well
        obj.mask = np.ma.mask_or(matches, ~np.isfinite(obj.data))
    else:
        obj.mask = matches
    return obj

scale_array(meta: PDRLike, obj: np.ndarray, object_name: str, inplace: bool = False, float_dtype: Optional['np.dtype'] = None)

Source code in pdr/_scaling.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
def scale_array(
    meta: PDRLike,
    obj: np.ndarray,
    object_name: str,
    inplace: bool = False,
    float_dtype: Optional["np.dtype"] = None,
):
    """
    Apply the SCALING_FACTOR and OFFSET given in an object's label block to
    `obj`.

    Missing keys, and string values of "NULL", "N/A", or "" (ignoring case
    and surrounding whitespace), fall back to a scale of 1 and an offset
    of 0. When the effective scale is 1 and the offset is 0, `obj` is
    returned untouched. If `inplace` is True and the operation does not
    require a cast to float, scaling goes through `_inplace_scale`;
    otherwise it goes through `_copy_scale`, after an optional cast to
    `float_dtype` when a float cast is required.
    """
    from pdr.formats.checkers import specialblock

    block = specialblock(meta, object_name)

    def label_value(key, fallback):
        """Fetch a scaling parameter from the block, or `fallback`."""
        if key not in block.keys():
            return fallback
        value = block[key]
        if isinstance(value, dict):
            value = value["value"]
        # a null-like string would be incorrect label formatting, but
        # catching it here is better than lots of special cases
        if isinstance(value, str) and (
            value.strip().upper() in {"NULL", "N/A", ""}
        ):
            return fallback
        return value

    scale = label_value("SCALING_FACTOR", 1)
    offset = label_value("OFFSET", 0)
    # meaningfully better for enormous unscaled arrays
    if (scale == 1) and (offset == 0):
        return obj
    to_float = casting_to_float(obj, scale, offset)
    # honor an in-place request where possible; a cast to float cannot be
    # performed in place
    # TODO: detect rollover cases, etc.
    if inplace is True and not to_float:
        return overflow_wrap(_inplace_scale)(obj, offset, scale)
    # when casting to float, permit specification of dtype prior to the
    # operation (float64 is numpy's default and often excessive)
    if to_float and float_dtype is not None:
        obj = obj.astype(float_dtype)
    return overflow_wrap(_copy_scale)(obj, offset, scale)

scale_pds4_tools_struct(struct: object) -> np.ndarray

see pds4_tools.reader.read_arrays.new_array

Source code in pdr/_scaling.py
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
def scale_pds4_tools_struct(struct: object) -> np.ndarray:
    """
    Scale the data array of a pds4_tools structure using the scaling factor,
    value offset, and special constants recorded in its metadata.

    see pds4_tools.reader.read_arrays.new_array

    Returns a masked array if the scaled result exposes a `mask` attribute,
    otherwise a plain ndarray.
    """
    # TODO: apply bit_mask
    from pdr.pds4_tools.reader.data_types import apply_scaling_and_value_offset

    raw = struct.data
    element_array = struct.meta_data["Element_Array"]
    scaling_factor = element_array.get("scaling_factor")
    value_offset = element_array.get("value_offset")
    # TODO: is this important?
    #     dtype = pds_to_numpy_type(struct.meta_data.data_type(),
    #     data=array, **scale_kwargs)
    special_constants = struct.meta_data.get("Special_Constants")
    scaled = apply_scaling_and_value_offset(
        raw,
        special_constants=special_constants,
        scaling_factor=scaling_factor,
        value_offset=value_offset,
    )
    if not hasattr(scaled, "mask"):
        return np.asarray(scaled)
    return np.ma.masked_array(np.asarray(scaled.data), scaled.mask)

bit_handling

utilities for parsing BIT_COLUMN objects in tables.

convert_byte_column_to_bits(byte_column: pd.Series, byte_order: ByteOrder) -> pd.Series

Converts byte strings in a Series into fixed-width binary strings (e.g. b"\x02" -> "00000010"). All elements of the Series must be byte strings, and all of them must have the same length.

Source code in pdr/bit_handling.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def convert_byte_column_to_bits(
    byte_column: pd.Series, byte_order: ByteOrder
) -> pd.Series:
    r"""
    Turn a Series of byte strings into a Series of fixed-width binary
    strings (e.g. b"\x02" -> "00000010"). Every element of the Series must
    be a byte string, and all of them must have the same length.
    """
    dtype = factor_to_dtype(len(byte_column.iloc[0]), byte_order)
    # jam the byte strings together and parse the result as a structured
    # integer ndarray, one record per original element
    packed = b"".join(byte_column.tolist())
    fields = pd.DataFrame.from_records(np.frombuffer(packed, dtype=dtype))
    # noinspection PyTypeChecker
    bit_series = [
        # bin() -> strip the '0b' prefix -> left-pad to the field's width
        fields[str(field_ix)]
        .map(bin)
        .str.slice(2, None)
        .str.zfill(dtype[field_ix].itemsize * 8)
        for field_ix in range(len(dtype))
    ]
    # TODO: should probably be a single pd.concat operation
    return reduce(add, bit_series)

convert_to_full_bit_string(table: pd.DataFrame, fmtdef: pd.DataFrame) -> pd.DataFrame

Converts the elements of a DataFrame's bit string columns from bytes to binary strings (e.g. '00100011').

Source code in pdr/bit_handling.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def convert_to_full_bit_string(
    table: pd.DataFrame, fmtdef: pd.DataFrame
) -> pd.DataFrame:
    """
    Replace the raw bytes in each of a DataFrame's bit string columns with
    binary strings (e.g. '00100011'). `table` is modified and returned.
    """
    start_bits = fmtdef.start_bit_list
    for row_ix in start_bits.dropna().index:
        # a non-list entry means the table column described by this fmtdef
        # row isn't a bit string
        if not isinstance(start_bits[row_ix], list):
            continue
        column_name = fmtdef.NAME[row_ix]
        raw_bytes = table[column_name]
        byte_order = determine_byte_order(fmtdef.DATA_TYPE[row_ix])
        table[column_name] = convert_byte_column_to_bits(raw_bytes, byte_order)
    return table

expand_bit_strings(table: pd.DataFrame, fmtdef: pd.DataFrame) -> pd.DataFrame

Top-level handler function for the bit column workflow. Converts a binary table's bit string columns (if any) from raw bytes to lists of strings (e.g. ['0010', '0011']).

Source code in pdr/bit_handling.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def expand_bit_strings(
    table: pd.DataFrame, fmtdef: pd.DataFrame
) -> pd.DataFrame:
    """
    Entry point for the bit column workflow. Any bit string columns of a
    binary table are converted from raw bytes into lists of binary strings
    (e.g. ['0010', '0011']). Tables whose format definition has no
    `start_bit_list` column are returned unchanged.
    """
    # bit_handling.get_bit_start_and_size() defines this column, and
    # handlers.add_bit_column_info() adds it.
    has_bit_columns = "start_bit_list" in fmtdef.columns
    if has_bit_columns:
        return splice_bit_string(
            convert_to_full_bit_string(table, fmtdef), fmtdef
        )
    return table

factor_to_dtype(field_length: int, byte_order: ByteOrder) -> np.dtype

Determine the smallest (in terms of length) structured dtype composed of unsigned integer dtypes that can parse a binary blob of a particular length and byteorder into a list of bytes. Optimizing the dtype length here reduces the number of times we have to call bin() in convert_byte_column_to_bits(), which is one of the biggest performance bottlenecks in this module.

Source code in pdr/bit_handling.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def factor_to_dtype(field_length: int, byte_order: ByteOrder) -> np.dtype:
    """
    Build the shortest structured dtype (fewest fields) made of unsigned
    integer dtypes that covers `field_length` bytes in the given byte order.

    Fields are named "0", "1", ... and are chosen greedily from widths of
    8, 4, 2, and 1 bytes, widest first. Keeping the number of fields small
    reduces the number of `bin()` calls in `convert_byte_column_to_bits()`,
    which is one of the biggest performance bottlenecks in this module.
    """
    widths = (8, 4, 2, 1)
    if field_length in widths:
        # if it fits within a simple dtype, great
        return np.dtype([("0", f"{byte_order}u{field_length}")])
    fields = []
    remaining_length = field_length
    for width in widths:
        # use as many fields of this width as still fit
        while remaining_length >= width:
            fields.append((str(len(fields)), f"{byte_order}u{width}"))
            remaining_length -= width
    return np.dtype(fields)

get_bit_start_and_size(obj: dict, definition: MultiDict, identifiers: DataIdentifiers) -> dict

Parse the BIT_COLUMN information from a MultiDict that represents a COLUMN definition into lists of bit string start positions and sizes that can later be used to parse byte strings into bit strings, then add that information to a parsed column definition. A subcomponent of the queries.read_format_block() workflow.

Source code in pdr/bit_handling.py
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
def get_bit_start_and_size(
    obj: dict, definition: MultiDict, identifiers: DataIdentifiers
) -> dict:
    """
    Collect the start bit and size of every BIT_COLUMN in a COLUMN
    definition and store them on `obj` (a parsed column definition) under
    "start_bit_list" and "bit_size_list", for later use when parsing byte
    strings into bit strings. A BIT_COLUMN with a truthy ITEMS value
    contributes one entry per item, spaced ITEM_BITS apart from START_BIT.
    A subcomponent of the `queries.read_format_block()` workflow.
    """
    bit_columns = definition.getall("BIT_COLUMN")
    start_bit_list, bit_size_list = [], []
    for bit_column in bit_columns:
        items = bit_column.get("ITEMS")
        if not items:
            # plain bit column: one start position, one size
            start_bit_list.append(bit_column.get("START_BIT"))
            bit_size_list.append(bit_column.get("BITS"))
            continue
        # repeated items: one entry per item
        item_bits = bit_column.get("ITEM_BITS")
        first_item_start_bit = bit_column.get("START_BIT")
        for item_index in range(items):
            start_bit_list.append(
                first_item_start_bit + item_index * item_bits
            )
            bit_size_list.append(item_bits)
    is_also_special, special_start_bit_list = check_special_bit_start_case(
        identifiers, bit_columns, start_bit_list
    )
    obj["start_bit_list"] = (
        special_start_bit_list if is_also_special else start_bit_list
    )
    obj["bit_size_list"] = bit_size_list
    return obj

set_bit_string_data_type(obj: dict, identifiers: Mapping[str, Any]) -> dict

Infer a bit string column's data type and add it to obj (a parsed column definition). A subcomponent of the queries.read_format_block() workflow.

Source code in pdr/bit_handling.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def set_bit_string_data_type(
    obj: dict, identifiers: Mapping[str, Any]
) -> dict:
    """
    Work out the data type of a bit string column and record it in
    `obj["DATA_TYPE"]` (`obj` is a parsed column definition). Special cases
    take their type from `check_special_bit_column_case()`; otherwise the
    byte order of the BIT_COLUMN's BIT_DATA_TYPE selects MSB_BIT_STRING or
    LSB_BIT_STRING, with a warning. Raises ValueError if the bit data type
    is missing or unusable. A subcomponent of the
    `queries.read_format_block()` workflow.
    """
    is_special, special_dtype = check_special_bit_column_case(identifiers)
    if is_special is not False:
        obj["DATA_TYPE"] = special_dtype
        return obj
    try:
        byteorder = sample_types(
            obj["BIT_COLUMN"]["BIT_DATA_TYPE"], 1, True
        )[0]
    except (KeyError, ValueError):
        raise ValueError("Incompatible data type for bit columns.")
    replacements = ((">", "MSB_BIT_STRING"), ("<", "LSB_BIT_STRING"))
    for symbol, bit_string_type in replacements:
        if byteorder == symbol:
            warnings.warn(
                f"Data type {obj['DATA_TYPE']} incompatible for bit column. "
                f"Changing to {bit_string_type}."
            )
            obj["DATA_TYPE"] = bit_string_type
            break
    return obj

splice_bit_string(table: pd.DataFrame, fmtdef: pd.DataFrame) -> pd.DataFrame

Split the elements of a table's bit string columns into lists of binary strings according to the bit boundaries specified in the label. This function expects to be called after convert_to_full_bit_string(), because the columns must already have been converted into binary strings.

Source code in pdr/bit_handling.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def splice_bit_string(
    table: pd.DataFrame, fmtdef: pd.DataFrame
) -> pd.DataFrame:
    """
    Break each element of a table's bit string columns into a list of binary
    strings, using the bit boundaries recorded in `fmtdef`. Must run after
    convert_to_full_bit_string(), since the columns are expected to hold
    binary strings already. `table` is modified and returned.
    """
    for row_ix in fmtdef.start_bit_list.dropna().index:
        one_indexed_starts = fmtdef.start_bit_list[row_ix]
        if not isinstance(one_indexed_starts, list):
            continue
        column_name = fmtdef.NAME[row_ix]
        bit_column = table[column_name]
        splitter = partial(
            split_bits,
            # python zero indexing
            start_bit_list=[start - 1 for start in one_indexed_starts],
            bit_size_list=fmtdef.bit_size_list[row_ix],
        )
        table[column_name] = bit_column.map(splitter)
    return table

split_bits(bit_string: Sequence, start_bit_list: Sequence[int], bit_size_list: Sequence[int]) -> list

Split a sequence into a list of subsequences based on start and size specifications. Intended here to be used on binary strings.

Source code in pdr/bit_handling.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def split_bits(
    bit_string: Sequence,
    start_bit_list: Sequence[int],
    bit_size_list: Sequence[int]
) -> list:
    """
    Cut a sequence into subsequences, one per (start, size) pair taken from
    `start_bit_list` and `bit_size_list`. Intended here to be used on binary
    strings.
    """
    return [
        bit_string[start:start + size]
        for start, size in zip(start_bit_list, bit_size_list)
    ]

browsify

functions for producing browse versions of products

_browsify_array(obj: np.ndarray, outbase: str, purge: bool = False, image_clip: Union[float, tuple[float, float], None] = None, mask_color: Optional[tuple[int, int, int]] = (0, 255, 255), band_ix: Optional[int] = None, save: bool = True, override_rgba: bool = False, image_format: str = 'jpg', slice_axis: int = 0, rgb_channels: Optional[tuple[int, int, int]] = None, **_) -> 'Union[Image.Image, list[Optional[Image.Image]]]'

Attempt to render (and optionally save) an ndarray as one or more images.

Source code in pdr/browsify.py
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
def _browsify_array(
    obj: np.ndarray,
    outbase: str,
    purge: bool = False,
    image_clip: Union[float, tuple[float, float], None] = None,
    mask_color: Optional[tuple[int, int, int]] = (0, 255, 255),
    band_ix: Optional[int] = None,
    save: bool = True,
    override_rgba: bool = False,
    image_format: str = "jpg",
    slice_axis: int = 0,
    rgb_channels: Optional[tuple[int, int, int]] = None,
    **_,
) -> 'Union[Image.Image, list[Optional[Image.Image]]]':
    """
    Try to render an ndarray as one or more images, optionally saving them.

    3-D arrays are first reduced by `_format_multiband_image()`. If that
    yields a tuple of bands, each band is rendered separately (with
    `_<index>` appended to `outbase`) and a list of results is returned;
    otherwise a single render result is returned. When `image_clip` is not
    given, a clip of (1, 1) is used and `nice_clip` is enabled.
    """
    nice_clip = image_clip is None
    if nice_clip:
        image_clip = (1, 1)
    if len(obj.shape) == 3:
        obj = _format_multiband_image(
            obj, band_ix, override_rgba, slice_axis, rgb_channels
        )

    def render(array, base):
        """Render one array with the shared rendering options."""
        return _render_array(
            array,
            base,
            purge,
            image_clip,
            mask_color,
            save,
            image_format,
            nice_clip
        )

    if isinstance(obj, tuple):
        return [
            render(band, f"{outbase}_{ix}") for ix, band in enumerate(obj)
        ]
    return render(obj, outbase)

_browsify_recarray(obj: np.recarray, outbase: str, **_)

Some tabular data with column groups ends up as numpy recarray, which is challenging to turn into a useful .csv file in some cases. This tries to save it as a CSV file, and if it fails, punts and pickles it.

Source code in pdr/browsify.py
192
193
194
195
196
197
198
199
200
201
202
203
def _browsify_recarray(obj: np.recarray, outbase: str, **_):
    """
    Some tabular data with column groups ends up as numpy recarray, which is
    challenging to turn into a useful .csv file in some cases. This _tries_ to
    save it as `<outbase>.csv`, and if that raises ValueError, punts and
    pickles it to `<outbase>_nested_recarray.pkl`.
    """
    try:
        obj = pd.DataFrame.from_records(obj)
        # noinspection PyTypeChecker
        obj.to_csv(outbase + ".csv")
    except ValueError:
        # use a context manager so the pickle file is flushed and closed
        # deterministically instead of leaking the handle
        with open(outbase + "_nested_recarray.pkl", "wb") as stream:
            pickle.dump(obj, stream)

_format_as_rgb(obj, rgb_channels)

Source code in pdr/browsify.py
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
def _format_as_rgb(obj, rgb_channels):
    """
    Stack three bands of `obj` depth-wise into a single array.

    The bands are those indexed by the first three entries of
    `rgb_channels`, or the first three bands of `obj` when `rgb_channels`
    is None. Masked arrays are stacked with `np.ma.dstack`, everything else
    with `np.dstack`.
    """
    if isinstance(obj, np.ma.MaskedArray):
        stack = np.ma.dstack
    else:
        stack = np.dstack
    if rgb_channels is None:
        return stack(list(obj[0:3]))
    red, green, blue = (obj[rgb_channels[ix]] for ix in range(3))
    return stack([red, green, blue])

_format_as_single_band(band_ix, obj)

for multiband arrays that are not presumably rgb(a), or if we have been instructed to by the override_rgba argument, only export a single band.

Source code in pdr/browsify.py
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
def _format_as_single_band(band_ix, obj):
    """
    for multiband arrays that are not presumably rgb(a), or if we have been
    instructed to by the override_rgba argument, only export a single band.

    Returns `obj[band_ix]` when `band_ix` is given and valid. If `band_ix`
    is None, or indexing with it raises IndexError, the middle band
    (index `obj.shape[0] // 2`) is returned instead and a warning is issued.
    """
    # floor division rather than round(): round() rounds halves to the
    # nearest even integer, which picks index 2 of 3 bands (the last band)
    # and index 4 of 7 instead of the true middle.
    middle_ix = obj.shape[0] // 2
    if band_ix is None:
        # by default, dump the middle band.
        warnings.warn(f"dumping only band {middle_ix} of this image")
        return obj[middle_ix]
    # if the band_ix argument has been passed, dump that band if possible
    try:
        return obj[band_ix]
    except IndexError:
        warnings.warn(
            f"band_ix={band_ix} does not exist, dumping band {middle_ix}"
        )
        return obj[middle_ix]

_format_multiband_image(obj, band_ix, override_rgba, slice_axis, rgb_channels)

helper function for _browsify_array -- truncate, stack, or burst multiband images and send for further processing.

Source code in pdr/browsify.py
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
def _format_multiband_image(
    obj, 
    band_ix, 
    override_rgba, 
    slice_axis, 
    rgb_channels
):
    """
    helper function for _browsify_array -- decide how a multiband image
    should be reduced before rendering.

    After moving `slice_axis` to the front, the image is either stacked as
    RGB (explicit `rgb_channels`, or 3/4 bands without `override_rgba`),
    burst into a tuple of all bands (`band_ix == "burst"`), or truncated to
    a single band.
    """
    if slice_axis != 0:
        obj = obj.swapaxes(0, slice_axis)
    if rgb_channels is not None:
        return _format_as_rgb(obj, rgb_channels)
    band_count = obj.shape[0]
    treat_as_rgba = (band_count in (3, 4)) and (override_rgba is not True)
    if not treat_as_rgba:
        if band_ix == "burst":
            return tuple(obj[ix] for ix in range(band_count))
        return _format_as_single_band(band_ix, obj)
    # treat 3/4 band arrays as rgb(a) images
    if band_ix is not None:
        warnings.warn(
            "treating image as RGB & ignoring band_ix argument; "
            "pass override_rgba=True to override this behavior"
        )
    if band_count == 4:
        warnings.warn(
            "transparency not supported, removing 4th (alpha) channel"
        )
    return _format_as_rgb(obj, rgb_channels)

_render_array(obj: np.ndarray, outbase: str, purge: bool, image_clip: Union[float, tuple[float, float]], mask_color: Union[int, tuple[int, int, int]], save: bool, image_format: str, nice_clip: bool) -> 'Optional[Image.Image]'

Handler function for array-rendering pipeline, used by browsify() on most ndarrays and by show() always. Render an ndarray as a PIL Image, optionally clipping and masking it. If save is True, save it to disk; if False, return it.

Source code in pdr/browsify.py
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
def _render_array(
    obj: np.ndarray,
    outbase: str,
    purge: bool,
    image_clip: Union[float, tuple[float, float]],
    mask_color: Union[int, tuple[int, int, int]],
    save: bool,
    image_format: str,
    nice_clip: bool
) -> 'Optional[Image.Image]':
    """
    Core of the array-rendering pipeline, used by `browsify()` on most
    ndarrays and by `show()` always. Converts an ndarray to an 8-bit PIL
    Image, optionally clipping it and filling masked elements with
    `mask_color`. With `save` False the Image is returned; otherwise it is
    written to `<outbase>.<image_format>` (downsampled first if its longest
    axis exceeds 65500) and nothing is returned.

    Raises ModuleNotFoundError if pillow is not installed.
    """
    try:
        from PIL import Image
    except ImportError:
        raise ModuleNotFoundError(
            "Rendering browse images requires the optional pillow dependency."
        )

    # upcast integer data types < 32-bit to prevent unhelpful wraparound
    is_narrow_integer = (
        obj.dtype.char in np.typecodes["AllInteger"]
    ) and (obj.itemsize <= 2)
    if is_narrow_integer:
        obj = obj.astype(np.int32)
    # convert to unsigned eight-bit integer to make it easy to write
    obj = eightbit(obj, image_clip, purge, nice_clip)
    # unless mask_color is None, fill masked elements -- probably special
    # constants -- with the RGB value it defines
    if isinstance(obj, np.ma.MaskedArray) and (mask_color is not None):
        obj = colorfill_maskedarray(obj, mask_color)
    image = Image.fromarray(obj)
    # TODO: this might be an excessively hacky way to implement Data.show(),
    #  probably split off the image-generating stuff above into a separate
    #  function
    if save is False:
        return image
    longest_axis = max(obj.shape)
    if longest_axis > 65500:
        # step through scales of 1/n until the longest axis fits
        scale = 1
        for divisor in naturals():
            scale = 1 / divisor
            if longest_axis * scale <= 65500:
                break
        warnings.warn(
            f"Axis length {longest_axis} > JPEG encoder threshold of "
            f"65500; downsampling browse image to {scale * 100}%."
        )
        image.thumbnail([int(axis * scale) for axis in image.size])
    image.save(f"{outbase}.{image_format}")

browsify(obj: Any, outbase: Union[str, Path], **dump_kwargs) -> None

attempts to dump a browse version of a data object, writing it into a file type that can be opened with desktop software: .jpg for most arrays, .csv for tables, .txt for most other things. if it can't find a reasonable translation, it attempts to dump it as .pkl (a serialized binary 'blob').

Source code in pdr/browsify.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def browsify(obj: Any, outbase: Union[str, Path], **dump_kwargs) -> None:
    """
    Try to write a browse version of a data object to a file that desktop
    software can open, using `outbase` as the path stem:

    * recarrays are delegated to `_browsify_recarray()`
    * 1-D ndarrays and DataFrames become .csv (single-row DataFrames are
      transposed first)
    * other ndarrays are delegated to `_browsify_array()`, which receives
      `dump_kwargs`
    * objects exposing `to_string` (probably an XML ElementTree interface)
      become .xml
    * anything else is written as its `str()` to .txt

    `None` is silently ignored.
    """
    outbase = str(outbase)
    if obj is None:
        return
    if isinstance(obj, np.recarray):
        _browsify_recarray(obj, outbase, **dump_kwargs)
        return
    if isinstance(obj, np.ndarray):
        if len(obj.shape) == 1:
            pd.DataFrame(obj).to_csv(outbase + ".csv", index=False)
        else:
            _browsify_array(obj, outbase, **dump_kwargs)
        return
    if isinstance(obj, pd.DataFrame):
        frame = obj.T if len(obj) == 1 else obj
        # noinspection PyTypeChecker
        frame.to_csv(outbase + ".csv")
        return
    # the .txt fallback should usually work. it may need another backup
    # binary blob pickler for really weird binary objects.
    is_xml_like = "to_string" in dir(obj)
    with open(outbase + (".xml" if is_xml_like else ".txt"), "w") as stream:
        stream.write(obj.to_string() if is_xml_like else str(obj))

colorfill_maskedarray(masked_array: np.ma.MaskedArray, color: Union[int, tuple[int, int, int]] = (0, 255, 255)) -> np.ndarray

masked_array: 2-D masked array or a 3-D masked array with last axis of length 3. for likely uses, this should probably be 8-bit unsigned integer. color: optionally-specified RGB color (default cyan) return a 2-D or 3-D array with masked values filled with color.

Source code in pdr/browsify.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def colorfill_maskedarray(
    masked_array: np.ma.MaskedArray,
    color: Union[int, tuple[int, int, int]] = (0, 255, 255),
) -> np.ndarray:
    """
    Replace the masked elements of `masked_array` with `color`.

    masked_array: 2-D masked array, or 3-D masked array whose last axis has
        length 3.
    color: a single integer fill value, or a 3-element sequence giving one
        fill value per output channel (default (0, 255, 255)).

    With an integer `color`, returns `masked_array.filled(color)`. Otherwise
    returns a 3-channel array stacked along the third axis, in which channel
    `i` has its masked elements filled with `color[i]`. Raises ValueError if
    a non-2-D input does not have a last axis of length 3.
    """
    if isinstance(color, int):
        return masked_array.filled(color)
    is_single_plane = len(masked_array.shape) == 2
    if not is_single_plane and masked_array.shape[-1] != 3:
        raise ValueError("3-D arrays must have last axis of length = 3")
    channels = []
    for band in range(3):
        # a 2-D input supplies the same plane for every output channel
        plane = masked_array if is_single_plane else masked_array[:, :, band]
        channels.append(plane.filled(color[band]))
    return np.dstack(channels)

eightbit(array: np.array, clip: Union[float, tuple[float, float]] = 0, inplace: bool = False, nice_clip: bool = False) -> np.ndarray

return an eight-bit version of an array, optionally clipped at min/max percentiles. if inplace is True, normalization may transform the original array, with attendant memory savings and destructiveness.

Source code in pdr/browsify.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def eightbit(
    array: np.array,
    clip: Union[float, tuple[float, float]] = 0,
    inplace: bool = False,
    nice_clip: bool = False
) -> np.ndarray:
    """
    Scale `array` to the range 0-255 with `normalize_range()`, round it, and
    cast it to unsigned 8-bit integer.

    `clip`, `inplace`, and `nice_clip` are passed straight through to
    `normalize_range()`; if `inplace` is True, the original array may be
    modified by the normalization step.
    """
    with warnings.catch_warnings():
        # we do not care about masked out-of-bounds values; it's why we masked
        warnings.filterwarnings(action="ignore", message=".*invalid value en*")
        scaled = normalize_range(array, (0, 255), clip, inplace, nice_clip)
        rounded = np.round(scaled)
        return rounded.astype(np.uint8)

find_masked_bounds(image: np.ma.MaskedArray, cheat_low: int, cheat_high: int) -> tuple[Optional[Number], Optional[Number]]

relatively memory-efficient way to perform bound calculations for normalize_range on a masked array.

Source code in pdr/browsify.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def find_masked_bounds(
    image: np.ma.MaskedArray, cheat_low: int, cheat_high: int
) -> tuple[Optional[Number], Optional[Number]]:
    """
    Compute the (minimum, maximum) bounds of the unmasked elements of
    `image`, for use by normalize_range.

    A nonzero `cheat_low` replaces the true minimum with the `cheat_low`-th
    percentile; a nonzero `cheat_high` replaces the true maximum with the
    (100 - `cheat_high`)-th percentile. Percentile results are cast to the
    dtype of `image`. Returns (None, None) if no element is unmasked.
    """
    unmasked = image[~image.mask].data
    if unmasked.size == 0:
        return None, None
    clip_low = cheat_low != 0
    clip_high = cheat_high != 0
    if not (clip_low or clip_high):
        return unmasked.min(), unmasked.max()
    if clip_low and clip_high:
        # one percentile call for both ends; `unmasked` is scratch space
        lower, upper = np.percentile(
            unmasked, [cheat_low, 100 - cheat_high], overwrite_input=True
        ).astype(image.dtype)
        return lower, upper
    if clip_low:
        # take the exact extreme before percentile() reorders the buffer
        upper = unmasked.max()
        lower = np.percentile(
            unmasked, cheat_low, overwrite_input=True
        ).astype(image.dtype)
        return lower, upper
    lower = unmasked.min()
    upper = np.percentile(
        unmasked, 100 - cheat_high, overwrite_input=True
    ).astype(image.dtype)
    return lower, upper

find_unmasked_bounds(image: np.ndarray, cheat_low: int, cheat_high: int) -> tuple[Number, Number]

straightforward way to find unmasked array bounds for normalize_range

Source code in pdr/browsify.py
49
50
51
52
53
54
55
56
57
58
59
60
61
def find_unmasked_bounds(
    image: np.ndarray, cheat_low: int, cheat_high: int
) -> tuple[Number, Number]:
    """
    Compute (minimum, maximum) bounds of `image` for normalize_range.

    When `cheat_low` is nonzero, the lower bound is the `cheat_low`-th
    percentile rather than the true minimum; when `cheat_high` is nonzero,
    the upper bound is the (100 - `cheat_high`)-th percentile rather than the
    true maximum. Percentile results are cast to the dtype of `image`.
    """
    lower = (
        image.min()
        if cheat_low == 0
        else np.percentile(image, cheat_low).astype(image.dtype)
    )
    upper = (
        image.max()
        if cheat_high == 0
        else np.percentile(image, 100 - cheat_high).astype(image.dtype)
    )
    return lower, upper

normalize_range(image: np.ndarray, bounds: Sequence[int] = (0, 1), clip: Union[float, tuple[float, float]] = 0, inplace: bool = False, nice_clip: bool = False) -> np.ndarray

simple linear min-max scaler that optionally percentile-clips the input at clip = (low_percentile, 100 - high_percentile). if inplace is True, may transform the original array, with attendant memory savings and destructive effects.

Source code in pdr/browsify.py
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def normalize_range(
    image: np.ndarray,
    bounds: Sequence[int] = (0, 1),
    clip: Union[float, tuple[float, float]] = 0,
    inplace: bool = False,
    nice_clip: bool = False
) -> np.ndarray:
    """
    simple linear min-max scaler that optionally percentile-clips the input at
    clip = (low_percentile, 100 - high_percentile). if inplace is True,
    may transform the original array, with attendant memory savings and
    destructive effects.

    image: array to rescale. if it is a masked array, bounds are computed
        by find_masked_bounds(); otherwise by find_unmasked_bounds().
    bounds: (range_min, range_max) of the output scale.
    clip: a single value used for both the low and high ends, or a sequence
        of (low, high) values.
    inplace: if True, clipping and scaling are applied to `image` itself.
    nice_clip: if True and the computed minimum equals the computed maximum,
        `image` is returned unscaled.

    returns the rescaled array. `image` is returned as-is when it is a masked
    array for which find_masked_bounds() returns None, or when the `nice_clip`
    condition above is met.
    """
    # a scalar clip applies symmetrically to both ends
    if isinstance(clip, Sequence):
        cheat_low, cheat_high = clip
    else:
        cheat_low, cheat_high = (clip, clip)
    range_min, range_max = bounds
    if isinstance(image, np.ma.MaskedArray):
        minimum, maximum = find_masked_bounds(image, cheat_low, cheat_high)
        # no bounds could be found (None minimum): nothing to scale
        if minimum is None:
            return image
    else:
        minimum, maximum = find_unmasked_bounds(image, cheat_low, cheat_high)
    if nice_clip is True and minimum == maximum:  # scaling was too intense
        return image
    # this test is only False when both cheat values are None, so the clip
    # step also runs for the default clip of 0
    if not ((cheat_high is None) and (cheat_low is None)):
        if inplace is True:
            image = np.clip(image, minimum, maximum, out=image)
        else:
            image = np.clip(image, minimum, maximum)
    # NOTE(review): if maximum == minimum and nice_clip is not True, the
    # divisions below divide by zero.
    if inplace is True:
        # perform the operation in-place
        image -= minimum
        image *= range_max - range_min
        if image.dtype.char in np.typecodes["AllInteger"]:
            # this loss of precision is probably better than
            # automatically typecasting it.
            # TODO: detect rollover cases, etc.
            image //= maximum - minimum
        else:
            image /= maximum - minimum
        image += range_min
        return image
    # out-of-place path: shift to zero, scale by the ratio of the output span
    # to the input span, then shift to range_min
    return (
        (image - minimum) *
        ((range_max - range_min) / (maximum - minimum))
        + range_min
    )

datatypes

definitions of sample types / data types / dtypes / ctypes, file formats and extensions, associated special constants, and so on.

IMPLICIT_PDS3_CONSTANTS = MappingProxyType({'uint8': {'NULL': 0, 'ISIS_SAT_HIGH': 255}, 'int8': {}, 'int16': {'N/A': -32768, 'UNK': 32767, 'ISIS_LOW_INST_SAT': -32766, 'ISIS_LOW_REPR_SAT': -32767, 'ISIS_HIGH_INST_SAT': -32765, 'ISIS_HIGH_REPR_SAT': -32764}, 'uint16': {'NULL': 0, 'N/A': 65533, 'UNK': 65534, 'ISIS_LOW_INST_SAT': 2, 'ISIS_LOW_REPR_SAT': 1, 'ISIS_HIGH_INST_SAT': 65534, 'ISIS_HIGH_REPR_SAT': 65535}, 'int32': {'N/A': -214743648, 'UNK': 2147483647}, 'int64': {'N/A': -214743648, 'UNK': 2147483647}, 'uint32': {'N/A': 4294967293, 'UNK': 4294967294, 'ISIS_NULL': read_hex('FF7FFFFB', '>I'), 'ISIS_LOW_INST_SAT': read_hex('FF7FFFFD', '>I'), 'ISIS_LOW_REPR_SAT': read_hex('FF7FFFFC', '>I'), 'ISIS_HIGH_INST_SAT': read_hex('FF7FFFFE', '>I'), 'ISIS_HIGH_REPR_SAT': read_hex('FF7FFFFF', '>I')}, 'float32': {'NULL': -3.4028226550889045e+38, 'N/A': -1e+32, 'UNK': 1e+32, 'ISIS_LOW_INST_SAT': read_hex('FF7FFFFD', '>f'), 'ISIS_LOW_REPR_SAT': read_hex('FF7FFFFC', '>f'), 'ISIS_HIGH_INST_SAT': read_hex('FF7FFFFE', '>f'), 'ISIS_HIGH_REPR_SAT': read_hex('FF7FFFFF', '>f')}, 'float64': {'NULL': -3.4028226550889045e+38}}) module-attribute

This constant defines common "implicit" (not specified in the label) PDS3 special constants. Its keys are numpy dtype names, which encode the kind and bit width of each array element. Some of these constants are derived from ISIS (although sometimes used in products that were not generated by ISIS!); others are suggested in the PDS3 Standards.

Note that the Standards specifically permit other special constants to exist, undefined in the label, and determined only by the operating environment of the data provider, so there can be no guarantee that other special constants do not exist in any particular product.

The "implicit" use of ISIS constants may in fact be illegal, but appears common. Also note that some ISIS values collide with Standards-specified N/A / UNK / NULL values -- again, we have no way to automatically distinguish them, and interpret them as the Standards values when we find them unless a label specifically states otherwise.

References: PDS3 Standards Reference v3.8, p.172 (https://pds.nasa.gov/datastandards/pds3/standards/sr/StdRef_20090227_v3.8.pdf) GDAL PDS3 driver TODO: -32768 is noted in this driver as NULL but defined in the Standards as an N/A value -- should clarify (https://github.com/OSGeo/gdal/blob/master/frmts/pds/pdsdataset.cpp) ISIS special pixel values (https://isis.astrogeology.usgs.gov/Object/Developer/_special_pixel_8h_source.html)

PDS3_CONSTANT_NAMES = tuple(PDS3_ISIS_CONSTANT_NAMES + PDS3_CONSTANT_NAMES) module-attribute

"basic" PDS3 special constant parameter names

PDS3_ISIS_CONSTANT_NAMES = tuple([f'{category}{direction}{entity}{prop}' for category, direction, entity, prop in (product(('CORE_', 'BAND_SUFFIX_', 'SAMPLE_SUFFIX_', 'LINE_SUFFIX_', ''), ('HIGH_', 'LOW_', ''), ('INST_', 'REPR_', ''), ('NULL', 'SATURATION', 'SAT')))]) module-attribute

some (all?) of these special constants are derived from ISIS properties; these are names they take on when they are made explicit in a PDS3 label

determine_byte_order(sample_type: str) -> ByteOrder

defines generic byte order for PDS3 physical data types

Source code in pdr/datatypes.py
42
43
44
45
46
def determine_byte_order(sample_type: str) -> ByteOrder:
    """
    Return the generic byte order code for a PDS3 physical data type name:
    "<" when the name begins with "PC", "LSB", or "VAX", and ">" otherwise.
    """
    little_endian_prefixes = ("PC", "LSB", "VAX")
    return "<" if sample_type.startswith(little_endian_prefixes) else ">"

integer_code(byteorder: ByteOrder, signed: bool, sample_bytes: int, for_numpy: bool = False) -> str

Translation from integer width, signedness, and byteorder to struct or numpy dtype string.

Source code in pdr/datatypes.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def integer_code(
    byteorder: ByteOrder,
    signed: bool,
    sample_bytes: int,
    for_numpy: bool = False
) -> str:
    """
    Translation from integer width, signedness, and byteorder to struct or
    numpy dtype string.

    byteorder: byte order prefix placed at the start of the returned code.
    signed: False selects the unsigned variant of the code.
    sample_bytes: width of the integer in bytes. 8, 4, and 2 map to the
        struct letters "q", "l", and "h"; any other width falls back to the
        one-byte letter "b".
    for_numpy: if True, 4- and 8-byte integers are expressed as explicit
        numpy width codes ("i4", "u8", ...) instead of struct letters.

    Returns `byteorder` followed by the type code.
    """
    if sample_bytes == 8:
        # struct's long long; previously 8-byte integers fell through to the
        # one-byte code
        letter = "q"
    elif sample_bytes == 4:
        letter = "l"
    elif sample_bytes == 2:
        letter = "h"
    else:
        letter = "b"
    if signed is False:
        # struct uses the uppercase letter for the unsigned variant
        letter = letter.upper()
    if for_numpy is True and sample_bytes in (4, 8):
        letter = f"i{sample_bytes}" if signed is True else f"u{sample_bytes}"

    return f"{byteorder}{letter}"

sample_types(sample_type: str, sample_bytes: int, for_numpy: bool = False) -> str

Defines a translation from PDS3 physical data types to Python struct or numpy dtype format strings, using both the type and byte width specified (because the mapping to type alone is not consistent across PDS3).

Source code in pdr/datatypes.py
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def sample_types(
    sample_type: str, sample_bytes: int, for_numpy: bool = False
) -> str:
    """
    Translate a PDS3 physical data type plus byte width into a Python struct
    or numpy dtype format string. Both are needed because the mapping from
    type name alone is not consistent across PDS3.

    sample_type: PDS3 data type name; spaces are treated as underscores.
    sample_bytes: width of one element in bytes.
    for_numpy: if True, produce numpy-flavored codes where they differ from
        struct codes.

    Raises NotImplementedError for REAL/FLOAT types that are not 4 or 8
    bytes wide and for VAX_REAL that is not 4 bytes wide. Raises KeyError
    for a non-integer type name that is not in the lookup table.
    """
    pds_type = sample_type.replace(" ", "_")
    is_binary_integer = (
        "INTEGER" in pds_type or pds_type == "BOOLEAN"
    ) and "ASCII" not in pds_type
    if is_binary_integer:
        return integer_code(
            determine_byte_order(pds_type),
            "UNSIGNED" not in pds_type,
            sample_bytes,
            for_numpy,
        )
    if sample_bytes == 8:
        float_letter = "d"
    elif sample_bytes == 4:
        float_letter = "f"
    else:
        # only binary real types need a float letter; reject unsupported
        # widths for them and leave the letter blank for everything else
        if "ASCII" not in pds_type and re.search("REAL|FLOAT", pds_type):
            raise NotImplementedError(
                f"{sample_bytes}-byte floats are not supported."
            )
        float_letter = ""
    if pds_type == "VAX_REAL" and sample_bytes != 4:
        raise NotImplementedError(
            "VAX reals that are not 4 bytes wide are not supported."
        )
    big_float = f">{float_letter}"
    little_float = f"<{float_letter}"
    # opaque bytes: "V" for numpy, "s" for struct
    raw_bytes = f"{'V' if for_numpy is True else 's'}{sample_bytes}"
    # fixed-width ASCII text of any kind
    text = f"S{sample_bytes}"
    codes = {
        "IEEE_REAL": big_float,
        "PC_REAL": little_float,
        "FLOAT": big_float,
        "REAL": big_float,
        "MAC_REAL": big_float,
        "SUN_REAL": big_float,
        "MSB_BIT_STRING": raw_bytes,
        "LSB_BIT_STRING": raw_bytes,
        # character string representing a real number
        "ASCII_REAL": text,
        # character string representing an integer
        "ASCII_INTEGER": text,
        # character string representing a date in PDS standard format
        # (e.g. 1990-08-01T23:59:59)
        "DATE": text,
        "CHARACTER": text,
        "TIME": text,
        "VOID": raw_bytes,
        "BCD": raw_bytes,
        "BINARY_CODED_DECIMAL": raw_bytes,
        # numpy has no built-in support for VAX_REAL or IBM_REAL, so only
        # byte width and order are made correct here, and the values are
        # transformed after load. an unsigned integer code is used rather
        # than a float code to avoid platform-specific differences or numpy
        # being excessively clever.
        "VAX_REAL": f"<u{sample_bytes}",
        "IBM_REAL": f">u{sample_bytes}",
        "EBCDIC": f"V{sample_bytes}",
        "EBCDIC_CHARACTER": f"V{sample_bytes}",
    }
    return codes[pds_type]

errors

AlreadyLoadedError

Bases: Exception

We already loaded this object and haven't been instructed to reload it.

Source code in pdr/errors.py
4
5
6
7
8
class AlreadyLoadedError(Exception):
    """
    Signals that an object has already been loaded and that no reload of it
    was requested.
    """

DuplicateKeyWarning

Bases: UserWarning

This product has duplicate object names; we're renaming them.

Source code in pdr/errors.py
11
12
13
class DuplicateKeyWarning(UserWarning):
    """
    Warns that a product contains duplicate object names, which are being
    renamed.
    """

formats

This module implements a wide variety of special-case behaviors for nonconforming or malformatted data products. It implements these behaviors as functions in distinct submodules organized by 'dataset' (mission, instrument, etc.); the checkers submodule contains dispatch functions that preempt generic behaviors and redirect them to functions from one of the dataset submodules. See the documentation for checkers for details on this behavior.

ImageProps

Bases: TypedDict

Standard image properties dict used in image-processing workflows.

Source code in pdr/pdrtypes.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
class ImageProps(TypedDict):
    """Standard image properties dict used in image-processing workflows."""
    # Number of bytes per pixel (eventually redundant with sample_type but
    # populated much earlier)
    BYTES_PER_PIXEL: Literal[1, 2, 4, 8]
    # Do the elements of the array, when loaded, represent VAX reals?
    is_vax_real: bool
    # numpy dtype string
    sample_type: str
    # Total number of elements
    pixels: int
    # Number of elements along each dimension
    nrows: int
    ncols: int
    nbands: int
    # Physical storage layout of 3D arrays (None for 2D arrays)
    band_storage_type: BandStorageType
    # Total row/column/band pad elements due to ISIS-style axplanes
    rowpad: int
    colpad: int
    bandpad: int
    # Number of pad elements for left/right sideplanes
    prefix_rows: Optional[int]
    suffix_rows: Optional[int]
    # Number of pad elements for bottom/topplanes
    prefix_cols: Optional[int]
    suffix_cols: Optional[int]
    # Number of pad elements for front/backplanes
    prefix_bands: Optional[int]
    suffix_bands: Optional[int]
    # Total pad elements due to line prefixes/suffixes
    linepad: int
    # Number of elements in line prefix and suffix
    line_prefix_pix: Optional[int]
    line_suffix_pix: Optional[int]
    # Order of axes expressed as a tuple of axis names, only used by ISIS
    # qubes. Annotated as a variable-length tuple: `tuple[Axname]` would
    # describe a tuple holding exactly one axis name.
    axnames: Optional[tuple[Axname, ...]]

check_special_bit_column_case(identifiers: Mapping[str, Any]) -> tuple[bool, Optional[str]]

Special case checker used by bit_handling.set_bit_string_data_type() to preempt generic data type inference.

Source code in pdr/formats/checkers.py
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
def check_special_bit_column_case(
    identifiers: Mapping[str, Any]
) -> tuple[bool, Optional[str]]:
    """
    Special case checker used by `bit_handling.set_bit_string_data_type()`
    to preempt generic data type inference.

    Returns (True, "MSB_BIT_STRING") when `identifiers["INSTRUMENT_NAME"]`
    exactly matches one of the instrument names listed below, and
    (False, None) otherwise.
    """
    msb_bit_string_instruments = (
        "ALPHA PARTICLE X-RAYSPECTROMETER",
        "JOVIAN AURORAL PLASMA DISTRIBUTIONS EXPERIMENT",
        "CHEMISTRY AND MINERALOGY INSTRUMENT",
        "MARS ADVANCED RADAR FOR SUBSURFACE ANDIONOSPHERE SOUNDING",
    )
    is_special = identifiers["INSTRUMENT_NAME"] in msb_bit_string_instruments
    return (True, "MSB_BIT_STRING") if is_special else (False, None)

check_special_bit_format(obj: dict, definition: MultiDict, identifiers: DataIdentifiers) -> tuple[bool, Optional[dict]]

Special case checker used by add_bit_column_info() to fix problems in obj and/or definition caused by mistakes in an external format file. Intended for cases where check_special_block() doesn't touch the relevant metadata, and errors are hit before check_special_structure() can be useful.

Source code in pdr/formats/checkers.py
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
def check_special_bit_format(
    obj: dict,
    definition: MultiDict,
    identifiers: DataIdentifiers
) -> tuple[bool, Optional[dict]]:
    """
    Special case checker used by add_bit_column_info() to repair `obj` and/or
    `definition` when an external format file contains mistakes. Meant for
    situations that check_special_block() does not reach and that fail before
    check_special_structure() could help.

    Dispatches on `identifiers`: a DATA_SET_ID matching the Cassini ISS
    pattern goes to `formats.cassini`, and Galileo Orbiter SSI products go to
    `formats.galileo`. In both cases the helper's return value is passed back
    unchanged. Returns (False, None) if no special case applies.
    """
    cassini_iss_pattern = r"CO-(CAL-ISS|[S/EVJ-]+ISSNA/ISSWA-2)"
    if re.match(cassini_iss_pattern, identifiers["DATA_SET_ID"]):
        return formats.cassini.iss_telemetry_bit_col_format(obj, definition)
    galileo_ssi_names = ("SOLID_STATE_IMAGING", "SOLID STATE IMAGING SYSTEM")
    is_galileo_ssi = (
        identifiers["SPACECRAFT_NAME"] == "GALILEO ORBITER"
        and identifiers["INSTRUMENT_NAME"] in galileo_ssi_names
    )
    if is_galileo_ssi:
        return formats.galileo.ssi_redr_bit_col_format(definition)
    return False, None

check_special_bit_start_case(identifiers, list_of_pvl_objects_for_bit_columns, start_bit_list) -> tuple[bool, Optional[list[int]]]

Special case checker used by get_bit_start_and_size() to fix incorrectly-defined bit offsets.

Source code in pdr/formats/checkers.py
766
767
768
769
770
771
772
773
774
775
776
777
def check_special_bit_start_case(
    identifiers, list_of_pvl_objects_for_bit_columns, start_bit_list
) -> tuple[bool, Optional[list[int]]]:
    """
    Special case checker used by get_bit_start_and_size() to fix
    incorrectly-defined bit offsets.

    identifiers: mapping that must contain "INSTRUMENT_NAME".
    list_of_pvl_objects_for_bit_columns, start_bit_list: passed unchanged to
        the dataset-specific fixer.

    Returns the result of `formats.juno.bit_start_find_and_fix()` for
    products whose INSTRUMENT_NAME is "JOVIAN INFRARED AURORAL MAPPER", and
    (False, None) for everything else.
    """
    # Compare for equality. A substring test (`name in "JOVIAN ..."`) would
    # also match an empty or partial instrument name and send unrelated
    # products to the Juno fixer.
    if identifiers["INSTRUMENT_NAME"] == "JOVIAN INFRARED AURORAL MAPPER":
        return formats.juno.bit_start_find_and_fix(
            list_of_pvl_objects_for_bit_columns, start_bit_list
        )
    return False, None

check_special_block(name: str, data: PDRLike, identifiers: Mapping) -> tuple[bool, Optional[MultiDict]]

specialize() target for queries.get_block(). Intended for cases in which label pointers don't correspond to label block names AND/OR if a value within the block needs to be changed before going to other functions.

Source code in pdr/formats/checkers.py
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
def check_special_block(
    name: str, data: PDRLike, identifiers: Mapping
) -> tuple[bool, Optional[MultiDict]]:
    """
    `specialize()` target for `queries.get_block()`. Intended for cases in
    which label pointers don't correspond to label block names AND/OR if a
    value within the block needs to be changed before going to other functions.

    name: name of the data object whose label block is being requested.
    data: the product object; passed through to the dataset-specific helpers
        and queried directly (`keys()`, `metaget_()`) by a few checks.
    identifiers: mapping of label identifier values (DATA_SET_ID,
        INSTRUMENT_ID, PRODUCT_ID, ...) used to recognize special cases.

    The checks run in order and the first match wins. Returns (False, None)
    if no special case applies. Some branches return `True, <helper result>`
    while others return the helper's result directly; the latter presumably
    produce the full (bool, block) pair themselves -- verify against the
    helper before changing either form.
    """
    # special cases recognized by object name alone
    if name == "XDR_DOCUMENT":
        return True, formats.cassini.xdr_redirect_to_image_block(data)
    if name == "CHMN_HSK_HEADER_TABLE":
        return True, formats.msl_cmn.fix_mangled_name(data)
    # the remaining cases depend on label identifiers, usually together
    # with the object name
    if (
        identifiers["DATA_SET_ID"].startswith("JNO-E/J/SS")
        and "BSTFULL" in identifiers["DATA_SET_ID"]
        and "FREQ_OFFSET_TABLE" in data.keys()
        and name in ("FREQ_OFFSET_TABLE", "DATA_TABLE")
    ):
        return True, formats.juno.waves_burst_fix_table_names(data, name)
    if (
        identifiers["INSTRUMENT_ID"] == "LAMP"
        and identifiers["PRODUCT_TYPE"] == "RDR"
        and "IMAGE" in name
        and "HISTOGRAM" in name
    ):
        # multiple image objects are defined by one non-unique image object
        return True, formats.lro.lamp_rdr_histogram_image_loader(data)
    if (
        identifiers["DATA_SET_ID"] == "LRO-L-MRFLRO-5-GLOBAL-MOSAIC-V1.0"
        and "GLOBAL_S4_32PPD" in data.metaget_("PRODUCT_ID")
        and name == "IMAGE"
    ):
        # typo in one of the labels
        return True, formats.lro.mini_rf_image_loader(data, name)
    if (
        identifiers["DATA_SET_ID"] == "PVO-V-ORPA-5-ELE/ION/PHOTO/UADS-V1.0"
        and "ORPA_LOW_RES" in identifiers["PRODUCT_ID"]
        and name == "TABLE"
    ):
        return True, formats.pvo.orpa_low_res_loader(data, name)
    if (
        identifiers["DATA_SET_ID"] == "PVO-V-OIMS-4-IONDENSITY-12S-V1.0"
        and name == "TABLE"
    ):
        return True, formats.pvo.oims_12s_loader(data, name)
    if (
        "GO-E-EPD-4-SUMM-" in identifiers["DATA_SET_ID"]
        and "E1_" in identifiers["PRODUCT_ID"]
        and name == "TIME_SERIES"
    ):
        return True, formats.galileo.epd_special_block(data, name)
    if (
        identifiers["INSTRUMENT_NAME"] == "PLASMA WAVE RECEIVER"
        and "SUMM" in identifiers["DATA_SET_ID"]
        and (name == "TIME_SERIES" or name == "TABLE")
    ):
        return True, formats.galileo.pws_special_block(data, name)
    if "ULY-J-EPAC-4-SUMM" in identifiers["DATA_SET_ID"] and name == "TABLE":
        return True, formats.ulysses.get_special_block(data, name, identifiers)
    if (
        "VG2-N-MAG-4-RDR-HGCOORDS" in identifiers["DATA_SET_ID"]
        and identifiers["STANDARD_DATA_PRODUCT_ID"] == "ASCII DATA"
        and name == "TABLE"
    ):
        return True, formats.voyager.mag_special_block(data, name)
    # the next three branches return the helper's result without wrapping it
    if (
        identifiers["DATA_SET_ID"] == "VG2-SS-PLS-4-SUMM-1HR-AVG-V1.0"
        and name == "TABLE"
    ):
        return formats.voyager.pls_avg_special_block(data, name)
    if (
        identifiers["DATA_SET_ID"] == "VG2-SS-PLS-3-RDR-FINE-RES-V1.0"
        and name == "TABLE"
    ):
        return formats.voyager.pls_fine_special_block(data, name)
    if (
        identifiers["DATA_SET_ID"] == "VG2-U-PLS-5-SUMM-IONBR-48SEC-V1.0"
        and identifiers["PRODUCT_ID"] == "SUMRY.DAT"
        and name == "TIME_SERIES"
    ):
        return formats.voyager.pls_ionbr_special_block(data, name)
    if identifiers["DATA_SET_ID"] == "M9-M-IRIS-3-RDR-V1.0" and (
        name == "SPECTRAL_SERIES"  # the data product
        or "SPECTRUM" in name  # the calibration data
    ):
        return True, formats.mariner.get_special_block(data, name)
    if (
        re.match(
            r"IHW-C-MSNRDR-3-RDR-HALLEY-(ETA-AQUAR|ORIONID)-V1.0",
            identifiers["DATA_SET_ID"],
        )
        and name == "TABLE"
    ):
        return True, formats.ihw.get_special_block(data, name)
    if (
        "VG2-" in identifiers["DATA_SET_ID"]
        and "-PRA-3-RDR-LOWBAND-6SEC-V1.0" in identifiers["DATA_SET_ID"]
        and name == "TABLE"
    ):
        return formats.voyager.pra_special_block(data, name, identifiers)
    if (
        identifiers["DATA_SET_ID"] == "PHX-M-MECA-2-NIEDR-V1.0"
        and re.match(r"MECA-EM1[012]", identifiers["PRODUCT_TYPE"])
        and name == "WCHEM_TABLE"
    ):
        return True, formats.phoenix.wcl_edr_special_block(data, name)
    if (
        "MEX-M-PFS-2-EDR-" in identifiers["DATA_SET_ID"]
        and (
            "RAW" in identifiers["PRODUCT_ID"]
            or "HK" in identifiers["PRODUCT_ID"]
        )
        and name == "TABLE"
    ):
        return formats.mex.pfs_edr_special_block(data, name)
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "MARS EXPRESS"
        and identifiers["INSTRUMENT_ID"] == "MRS"
        and all(x in identifiers["PRODUCT_ID"] for x in ["ODF", "L1B", "RMP"])
        and "TABLE" in name
    ):
        return True, formats.mex.mrs_l1b_odf_rmp_redirect(data)
    if (
        identifiers["DATA_SET_ID"] == "WFF-E-ATM-1/5-V1.0"
        and name == "IMAGE"
    ):
        return formats.ground.wff_atm_special_block(data, name)
    if (
        identifiers["DATA_SET_ID"] == "CO-CAL-ISS-2-V1.0"
        and name == "IMAGE"
        and (".DA" in data.metaget_("^IMAGE")[0] 
             or identifiers["FILE_RECORDS"] == 1025)
    ):
        return formats.cassini.iss_calib_da_special_block(data, name)
    if (
        identifiers["DATA_SET_ID"] in ("CO-S-ISSNA/ISSWA-2-EDR-V1.0",
                                       "CO-E/V/J-ISSNA/ISSWA-2-EDR-V1.0",
                                       "CO-CAL-ISSNA/ISSWA-2-EDR-V1.0")
        and name in ("LINE_PREFIX_TABLE",
                     "TELEMETRY_TABLE")
    ):
        return formats.cassini.iss_edr_special_block(data, name)
    if (
        identifiers["DATA_SET_ID"] == "MGS-M-MOLA-3-PEDR-L1A-V1.0"
        and "TABLE" in name
    ):
        return True, formats.mgs.mola_pedr_special_block(data, name, identifiers)
    if (
        identifiers["DATA_SET_ID"] == "IUE-C-SWP-3-EDR-IUECDB-V1.0"
        and name == "QUALITY_IMAGE"
    ):
        return formats.iue.get_special_block(data, name)
    if (
        identifiers["SPACECRAFT_NAME"] == "GALILEO ORBITER"
        and identifiers["INSTRUMENT_NAME"] == "SOLID STATE IMAGING SYSTEM"
        and name == "LINE_PREFIX_TABLE"
    ):
        return True, formats.galileo.ssi_prefix_block(data, name)
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "MARS SCIENCE LABORATORY"
        and identifiers["INSTRUMENT_ID"] in ["MAHLI", "MAST_RIGHT",
                                             "MAST_LEFT", "MARDI"]
        and "EDR" in identifiers["DATA_SET_ID"]
        and name == "IMAGE"
    ):
        return True, formats.msl_edr.get_special_block(data, name)
    # note: this check does not look at `name`, so it applies to every
    # object of a matching product
    if (
            identifiers["INSTRUMENT_HOST_NAME"] == "VENUS EXPRESS"
            and identifiers["INSTRUMENT_ID"] in ["VRA"]
            and "1B_RMP_" in identifiers["PRODUCT_ID"]
    ):
        return formats.vex_vera.get_special_block(data, name)
    # the remaining checks all dispatch to formats.kaguya
    if (
        identifiers["DATA_SET_ID"] == 'SLN-L-GRS-5-NUCLIDE-MAP-V2.0'
        and name == "TABLE"
    ):
        return True, formats.kaguya.get_special_block_grs_table(data)
    if (
        "SLN-L-SP" in identifiers['DATA_SET_ID']
        and "LEVEL2B" in identifiers['DATA_SET_ID']
        and name == "ANCILLARY_AND_SUPPLEMENT_DATA"
    ):
        return True, formats.kaguya.get_special_block_sp_2b_supp(data, name)
    if(
        "SLN-L-LMAG-5-MA-GRID" in identifiers['DATA_SET_ID']
        and name == "TABLE"
    ):
        return True, formats.kaguya.get_special_grid_table_block()
    if(
        identifiers['DATA_SET_ID'] == "SLN-L-LMAG-5-1D-SIGMA-ECS-V1.0"
        and name == "TABLE"
    ):
        return True, formats.kaguya.get_special_1d_sigma_block()
    if(
        identifiers['DATA_SET_ID'] == "SLN-L-LMAG-3-MAG-TS-V1.0"
        and name == "TABLE"
    ):
        return True, formats.kaguya.get_special_mag_ts_block(data)
    if(
        "SLN-L-RISE-5-TRAJ-" in identifiers['DATA_SET_ID']
        and name == "TABLE"
    ):
        return True, formats.kaguya.rise_traj_special_block()

    # no special case matched; the caller should use generic behavior
    return False, None

check_special_compressed_file_reader(identifiers: DataIdentifiers, fn: str)

Distribute to correct specialized image loader, otherwise return False/None. Preempt loaders.datawrap.ReadImage's dispatch to read_image()

Source code in pdr/formats/checkers.py
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
def check_special_compressed_file_reader(identifiers: DataIdentifiers, fn: str):
    """
    Route compressed image files to a specialized loader, preempting
    loaders.datawrap.ReadImage's dispatch to `read_image()`.

    Returns (True, <loader result>) for MSL MAHLI / MAST_RIGHT / MAST_LEFT /
    MARDI products whose DATA_SET_ID contains "EDR", and for Mars Global
    Surveyor MOC-NA / MOC-WA products whose FILE_NAME contains "IMQ".
    Returns (False, None) for everything else.
    """
    msl_camera_ids = ["MAHLI", "MAST_RIGHT", "MAST_LEFT", "MARDI"]
    # `and` chains are kept so that later identifiers are only looked up
    # when the earlier conditions hold
    is_msl_camera_edr = (
        identifiers["INSTRUMENT_HOST_NAME"] == "MARS SCIENCE LABORATORY"
        and identifiers["INSTRUMENT_ID"] in msl_camera_ids
        and "EDR" in identifiers["DATA_SET_ID"]
    )
    if is_msl_camera_edr:
        return True, formats.msl_edr.msl_edr_image_loader(fn)
    moc_ids = ["MOC-NA", "MOC-WA"]
    is_mgs_moc_imq = (
        identifiers["SPACECRAFT_NAME"] == "MARS_GLOBAL_SURVEYOR"
        and identifiers["INSTRUMENT_ID"] in moc_ids
        and "IMQ" in identifiers["FILE_NAME"]
    )
    if is_mgs_moc_imq:
        return True, formats.mgs_moc.mgs_moc_comp_image_loader(fn, identifiers)
    return False, None

check_special_fits_start_byte(identifiers: DataIdentifiers, name: str, hdulist: HDUList) -> tuple[bool, Optional[int]]

Preempts generic PDS3 data object -> FITS start byte mapping. Wraps get_fits_start_byte().

Source code in pdr/formats/checkers.py
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
def check_special_fits_start_byte(
    identifiers: DataIdentifiers, name: str, hdulist: HDUList
) -> tuple[bool, Optional[int]]:
    """
    Look for a product-specific rule that replaces the generic PDS3 data
    object -> FITS start byte mapping. Wraps `get_fits_start_byte()`.

    Returns `(False, None)` when no rule matches. Most matching rules return
    `(True, <helper result>)`; the two LRO LAMP rules return their helper's
    result unwrapped.
    """
    if identifiers["INSTRUMENT_HOST_NAME"] == "DAWN":
        if "FC2" in identifiers["DATA_SET_ID"] and name == "HISTORY":
            return True, formats.dawn.dawn_history_hdu_exception()

    dataset_id = identifiers["DATA_SET_ID"]
    if dataset_id.startswith("HST-S-WFPC2-3-RPX") and "IMAGE" in name:
        return True, formats.saturn_rpx.rpx_img_hdu_start_byte(name, hdulist)

    if identifiers["INSTRUMENT_ID"] == "HRIV":
        if (
            identifiers["PRODUCT_TYPE"] == "RADIANCE_DECONVOLVED"
            and name.startswith("EXT_MASK")
        ):
            return True, formats.epoxi.hriv_deconv_mask_start_byte(
                name, hdulist
            )

    # NOTE(review): this MSGR-...-MDIS data set is dispatched to a helper
    # that lives in formats.galileo -- confirm that is the intended module.
    if dataset_id.startswith("MSGR-H-MDIS-6-CAL"):
        return True, formats.galileo.mdis_fits_start_byte(name, hdulist)
    if dataset_id == "MSSSO-J-CASPIR-3-RDR-SL9-STDS-V1.0":
        return True, formats.ground.mssso_cal_start_byte(name, hdulist)
    if "MEX-M-VMC-3-RDR" in dataset_id:
        return True, formats.mex.vmc_rdr_hdu_selection(name, hdulist)

    # the LAMP helpers' return values are passed back as-is rather than
    # being wrapped in `(True, ...)` here
    if dataset_id == "LRO-L-LAMP-2-EDR-V1.0" and "TABLE" in name:
        return formats.lro.lamp_edr_hdu_exceptions(name, hdulist)
    if dataset_id == "LRO-L-LAMP-3-RDR-V1.0":
        return formats.lro.lamp_rdr_hdu_start_byte(name, hdulist)

    product_type = identifiers["PRODUCT_TYPE"]
    if product_type == "RDR" and "JNO-J-UVS-3-RDR-" in dataset_id:
        return True, formats.juno.uvs_rdr_start_byte(name, hdulist)
    if product_type == "EDR" and "JNO-J-UVS-2-EDR-" in dataset_id:
        return True, formats.juno.uvs_edr_start_byte(name, hdulist)
    return False, None

check_special_fn(data: PDRLike, object_name: str, identifiers: DataIdentifiers) -> tuple[bool, Optional[str]]

Preempts generic filename specification. Called inline by Data._object_to_filename().

Source code in pdr/formats/checkers.py
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
def check_special_fn(
    data: PDRLike, object_name: str, identifiers: DataIdentifiers
) -> tuple[bool, Optional[str]]:
    """
    Look for a product-specific rule that replaces the generic way an
    object's filename is determined. Called inline by
    `Data._object_to_filename()`.

    Returns `(False, None)` when no rule matches; otherwise returns whatever
    the format-specific helper returns.
    """
    dataset_id = identifiers["DATA_SET_ID"]

    if dataset_id == "CLEM1-L-RSS-5-BSR-V1.0":
        if object_name in ("HEADER_TABLE", "DATA_TABLE"):
            # sequence wrapped as string for object names
            return formats.clementine.get_fn(data, object_name)

    if identifiers["SPACECRAFT_NAME"] == "MAGELLAN":
        # note that "ibg" is matched without a leading dot, unlike ".img"
        if data.filename.endswith((".img", "ibg")) and object_name == "TABLE":
            return formats.mgn.get_fn(data)

    # filenames are frequently misspecified
    if dataset_id.startswith("CO-D-CDA") and object_name == "TABLE":
        return formats.cassini.cda_table_filename(data)

    # THEMIS labels don't always mention when a file is stored gzipped
    if identifiers["INSTRUMENT_ID"] == "THEMIS":
        return formats.themis.check_gzip_fn(data, object_name)

    nh_dataset_ids = (
        "NH-P-PEPSSI-4-PLASMA-V1.0",
        "NH-X-SWAP-5-DERIVED-SOLARWIND-V1.0",
        "NH-P/PSA-LORRI/ALICE/REX-5-ATMOS-V1.0",
    )
    if dataset_id in nh_dataset_ids and object_name == "SPREADSHEET":
        return formats.nh.get_fn(data)

    if identifiers["SPACECRAFT_NAME"] == "GALILEO ORBITER":
        if (
            identifiers["INSTRUMENT_NAME"] == "SOLID_STATE_IMAGING"
            and object_name == "IMAGE_LINE_PREFIX_TABLE"
        ):
            return formats.galileo.ssi_redr_prefix_fn(data)

    if dataset_id == "VG2-SR/UR/NR-PPS-2/4-OCC-V1.0":
        if (
            identifiers["PRODUCT_TYPE"] == "JITTER"
            and object_name == "SERIES"
        ):
            return formats.voyager.get_fn(data)

    if identifiers["INSTRUMENT_HOST_NAME"] == "MARS SCIENCE LABORATORY":
        msl_cameras = ("MAHLI", "MAST_RIGHT", "MAST_LEFT", "MARDI")
        if (
            identifiers["INSTRUMENT_ID"] in msl_cameras
            and "EDR" in dataset_id
            and object_name == "IMAGE"
        ):
            return formats.msl_edr.msl_msss_edr_prefix_fn(data)

    return False, None

check_special_label(fn: Union[str, Path])

Used primarily to check for labels with known characters invalid in utf-8. We then read the label with a more correct or lenient encoding. Preempt loaders.datawrap.ReadLabel's dispatch to read_label(). Also used in read_pvl().

Source code in pdr/formats/checkers.py
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
def check_special_label(fn: Union[str, Path]):
    """
    Used primarily to check for labels with known characters invalid in utf-8.
    We then read the label with a more correct or lenient encoding. Preempt
    loaders.datawrap.ReadLabel's dispatch to `read_label()`. Also used in
    `read_pvl()`.

    `fn` may be a string or a `Path`; only its string form is inspected.

    Returns `(True, <special label>)` when the filename matches one of the
    known patterns and `(False, None)` otherwise.
    """
    # A Path does not support substring tests (`"x" in Path(...)` raises
    # TypeError), so match against the string form. The caller's original
    # object is still what gets handed to the format-specific readers.
    fn_text = str(fn)

    change_tags = ('CE1', 'CE2', 'CE3')
    change_suffixes = (
        '.1A', '.1B', '.1C',
        '.2A', '.2B', '.2C',
        '.3A', '.3B', '.3C',
        '.01', '.02', '.03',
    )
    if (
        any(tag in fn_text for tag in change_tags)
        and any(sfx in fn_text for sfx in change_suffixes)
    ):
        return True, formats.change.special_label(fn)
    if (
        ".TAB" in fn_text and "_DOY" in fn_text
        and ("MAG_" in fn_text or "BIO_" in fn_text) and "_V1" in fn_text
    ):
        return True, formats.vex_mag.special_label(fn)
    return False, None

check_special_objects(identifiers: DataIdentifiers)

Check to add objects not correctly ID'd as objects in a label, or remove objects ID'd in a label. Called inline by _find_objects().

Source code in pdr/formats/checkers.py
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
def check_special_objects(identifiers: DataIdentifiers):
    """
    Supply a replacement list of object names for products whose labels
    identify their objects incorrectly (objects missing, or listed when they
    should not be). Called inline by `_find_objects()`.

    Returns `(True, <list of object names>)` for a recognized product and
    `(False, None)` otherwise.
    """
    if identifiers["INSTRUMENT_HOST_NAME"] == "MARS SCIENCE LABORATORY":
        msl_cameras = ("MAHLI", "MAST_RIGHT", "MAST_LEFT", "MARDI")
        if (
            identifiers["INSTRUMENT_ID"] in msl_cameras
            and "EDR" in identifiers["DATA_SET_ID"]
        ):
            # a consequence of this is the geometry file and miniheader
            # objects denoted in the label for the edrs
            return True, ['IMAGE', 'MODEL_DESC']

    if identifiers["SPACECRAFT_NAME"] == "Chang'E-3 Rover":
        product_id = identifiers["PRODUCT_ID"]
        if ".2A" in product_id:
            # these lists would need to be modified if you could find the
            # cal target data Excel file and removed the bad characters from
            # the label around it
            if "CE3_BMYK_VNIS-CC_SCI_N" in product_id:
                return True, ['LABEL', 'IMAGE_PREFIX', 'IMAGE']
            table_only_tags = (
                "CE3_BMYK_VNIS-SC_SCI_N_",
                "CE3_BMYK_VNIS-SD_SCI_N_",
            )
            if any(tag in product_id for tag in table_only_tags):
                return True, ['LABEL', 'TABLE']

    return False, None

check_special_offset(name: str, data: PDRLike, identifiers: DataIdentifiers, fn: str) -> tuple[bool, Optional[int]]

Preempt generic inference of an object's byte offset within a file. Wraps loaders.queries.data_start_byte().

Source code in pdr/formats/checkers.py
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def check_special_offset(
    name: str, data: PDRLike, identifiers: DataIdentifiers, fn: str
) -> tuple[bool, Optional[int]]:
    """
    Look for a product-specific rule that replaces the generic inference of
    an object's byte offset within a file. Wraps
    `loaders.queries.data_start_byte()`.

    Returns `(False, None)` when no rule matches; otherwise returns whatever
    the format-specific helper returns.
    """
    instrument_id = identifiers["INSTRUMENT_ID"]
    # these incorrectly specify object length rather than
    # object offset in the ^HISTOGRAM pointer target
    if instrument_id == "CHEMIN":
        return formats.msl_cmn.get_offset(name)

    dataset_id = identifiers["DATA_SET_ID"]
    if dataset_id == "CLEM1-L-RSS-5-BSR-V1.0":
        if re.match("(HEADER|DATA)_TABLE", name):
            # sequence wrapped as string for object names
            return formats.clementine.get_offset(data, name)

    if instrument_id == "THEMIS" and name == "QUBE":
        return formats.themis.get_qube_offset(data)

    disr_file_pattern = re.compile(r"STRIP|VISIBL|IMAGE|IR_|TIME|SUN|SOLAR")
    # NOTE(review): `and` binds tighter than `or`, so a FILE_NAME match on
    # its own selects this branch regardless of instrument or product type.
    # The grouping below only spells out what the expression already did --
    # confirm that this is the intended rule.
    if (
        (
            identifiers["INSTRUMENT_NAME"]
            == "DESCENT IMAGER SPECTRAL RADIOMETER"
            and identifiers["PRODUCT_TYPE"] == "RDR"
        )
        or disr_file_pattern.search(identifiers["FILE_NAME"])
    ):
        return formats.cassini.get_offset(fn, identifiers)

    if dataset_id == "CO-E/V/J-ISSNA/ISSWA-2-EDR-V1.0":
        # only products whose stop count lies in this closed interval
        if 1359362956 <= float(data.metaget_("SPACECRAFT_CLOCK_STOP_COUNT")):
            if (
                float(data.metaget_("SPACECRAFT_CLOCK_STOP_COUNT"))
                <= 1363539029
            ):
                return formats.cassini.coiss_1006_offset(
                    data, name, identifiers
                )

    if (
        instrument_id == "CRAT"
        and identifiers["PRODUCT_TYPE"] == "EDR"
        and name == "TABLE_1"
    ):
        return formats.lro.get_crater_offset()

    if dataset_id == "PHX-M-SSI-5-ATMOS-OPACITY-V1.0" and "TABLE" in name:
        return formats.phoenix.phxao_table_offset(fn, identifiers)

    if dataset_id == "PHX-M-MECA-4-NIRDR-V1.0":
        if (
            identifiers["PRODUCT_TYPE"] in ("MECA_WCL_CP", "MECA_WCL_CV")
            and "TABLE" in name
        ):
            return formats.phoenix.wcl_rdr_offset(data, name)

    if identifiers["INSTRUMENT_HOST_NAME"] == "MARS SCIENCE LABORATORY":
        if (
            instrument_id == "REMS"
            and name == "REMS_REPORT_TABLE"
            and any(
                tag in identifiers["PRODUCT_ID"]
                for tag in ("HSDEF__", "HSREG__")
            )
        ):
            return formats.msl_rems.edr_offset(data, name)
        msl_cameras = ("MAHLI", "MAST_RIGHT", "MAST_LEFT", "MARDI")
        if instrument_id in msl_cameras and "EDR" in dataset_id:
            return formats.msl_edr.edr_offset(data, name)

    return False, None

check_special_pds4_cases(structure, filename, object_name)

Load objects from PDS4 files with known issues that do not currently work with pds4_tools. Mostly utilized by datasets not verified by the PDS but that have PDS4 labels (ISRO, ESA, CNSA etc).

Source code in pdr/formats/checkers.py
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
def check_special_pds4_cases(structure, filename, object_name):
    """
    Load objects from PDS4 products that have known issues and do not
    currently work with pds4_tools. Mostly relevant to datasets that carry
    PDS4 labels but were not verified by the PDS (ISRO, ESA, CNSA etc).

    Returns whatever the matching format-specific reader returns, or None
    when no special case applies.
    """
    # Each rule is (substrings that must all occur in `filename`,
    # whether the fixed-width reader is used). Rules are tried in order and
    # every one of them applies only to the object named "TABLE_0".
    change_table_rules = (
        (("CE6-L_GRAS_LMS-M", "SCI", ".2B"), True),
        (("CE6-L_GRAS_LMS-S", "SCI", ".2B"), True),
        (("CE6-L_GRAS_LMS-N", "SCI", ".2B"), True),
        (("CE5-L_GRAS_LMS-N", ".2B", "SCI"), True),
        (("CE5-L_GRAS_LMS-M", ".2B", "SCI"), True),
        (("CE4_GRAS_ASAN-SCI_SCI", ".2B"), False),
        (("CE4_GRAS_LND-DPSL_SCI", ".2A"), False),
        (("CE4_GRAS_LND-ThN_SCI", ".2A"), False),
        (("CE4_GRAS_LND-TID_SCI", ".2A"), False),
        # "CE4_GRAS_VNIS-VD_SCI" ".2B" tables are intentionally not listed:
        # this is a fixed width table, but it's not all UTF-8 so PD can't
        # handle it
        (("CE4_GRAS_VNIS-SD_SCI", ".2B"), True),
    )
    for required_parts, is_fixed_width in change_table_rules:
        if (
            all(part in filename for part in required_parts)
            and object_name == "TABLE_0"
        ):
            if is_fixed_width:
                return formats.change.read_change_fw_table(
                    structure, filename
                )
            return formats.change.read_table_using_spaces(structure, filename)

    if "ch2_cla_l1_" in filename and object_name in ("header_Data", "data"):
        return formats.ch2_isro.read_class_fits_table(filename, object_name)
    return None

check_special_position(identifiers: DataIdentifiers, block: MultiDict, target: PhysicalTarget, name: str, fn: str, start_byte: int) -> tuple[bool, Optional[int]]

Preempt generic detection of a table's row or byte offset within a file. Wraps table_position(). Used for table-specific cases that are partially but not wholly handled by data_start_byte(), so should not be defined in check_special_offset().

Source code in pdr/formats/checkers.py
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
def check_special_position(
    identifiers: DataIdentifiers,
    block: MultiDict,
    target: PhysicalTarget,
    name: str,
    fn: str,
    start_byte: int,
) -> tuple[bool, Optional[int]]:
    """
    Look for a product-specific rule that replaces the generic detection of
    a table's row or byte offset within a file. Wraps `table_position()`.
    Meant for table-specific cases that `data_start_byte()` handles only in
    part, which is why they are not defined in `check_special_offset()`.

    Returns `(False, None)` when no rule matches. Most matching rules return
    `(True, <helper result>)`; the LRO RSS and Venus Express VRA rules return
    their helper's result unwrapped.
    """
    # positional arguments shared by most of the format-specific helpers
    shared_args = (identifiers, block, target, name, start_byte)

    instrument_id = identifiers["INSTRUMENT_ID"]
    if instrument_id == "MARSIS" and " TEC " in identifiers["DATA_SET_NAME"]:
        return True, formats.mex.marsis_get_position(*shared_args)

    huygens_file_pattern = re.compile(
        r"DARK|STRIP|VIS_EX|SUN|VISIBL|TIME|SOLAR|IMAGE"
    )
    huygens_file_match = (
        identifiers["INSTRUMENT_HOST_NAME"] == "HUYGENS PROBE"
        and huygens_file_pattern.search(identifiers["FILE_NAME"])
    )
    # NOTE(review): `and` binds tighter than `or`, so the restriction on
    # `name` applies only to the instrument/product-type alternative, not to
    # the Huygens filename alternative. The grouping below only spells out
    # what the expression already did -- confirm that this is intended.
    if huygens_file_match or (
        identifiers["INSTRUMENT_NAME"] == "DESCENT IMAGER SPECTRAL RADIOMETER"
        and identifiers["PRODUCT_TYPE"] == "RDR"
        and name in ("TABLE", "HEADER")
    ):
        # this helper takes `fn` ahead of `start_byte`
        return True, formats.cassini.get_position(
            identifiers, block, target, name, fn, start_byte
        )

    dataset_id = identifiers["DATA_SET_ID"]
    if (
        "CO-V/E/J/S/SS-RPWS-" in dataset_id
        and identifiers["PRODUCT_TYPE"] == "ANCILLARY"
        and name == "RPWS_TIME_ORDERED_TABLE"
    ):
        return True, formats.cassini.rpws_ancil_position(*shared_args)

    if dataset_id == "LRO-L-RSS-1-TRACKING-V1.0" and name == "WEAREC_TABLE":
        # helper result is returned as-is, not wrapped in (True, ...)
        return formats.lro.rss_get_position(*shared_args)

    if dataset_id == "DIF-C-HRIV/MRI-5-HARTLEY2-SHAPE-V1.0":
        if (
            identifiers["PRODUCT_ID"] == "HARTLEY2-CARTESIAN-PLATE-MODEL"
            and "TABLE" in name
        ):
            return True, formats.epoxi.cart_model_get_position(*shared_args)

    if "MEX-M-MRS-5-OCC" in dataset_id and name == "ATM_TABLE":
        return True, formats.mex.mrs_ddr_atmo_position(*shared_args)

    if dataset_id == "PHX-M-SSI-5-ATMOS-OPACITY-V1.0" and "HEADER" in name:
        return True, formats.phoenix.phxao_header_position(*shared_args)

    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "MARS EXPRESS"
        and instrument_id == "MRS"
    ):
        # (tags that must all appear in PRODUCT_ID, required object name)
        mrs_rules = (
            (("ICL", "L1B"), "DOPPLER_TABLE"),
            (("ODF", "L02"), "RANGING_TABLE"),
        )
        for product_tags, table_name in mrs_rules:
            if (
                all(tag in identifiers["PRODUCT_ID"] for tag in product_tags)
                and name == table_name
            ):
                return True, formats.mex.mrs_get_position(*shared_args)

    occultation_dataset_ids = (
        "ESO1M-SR-APPH-4-OCC-V1.0",
        "ESO22M-SR-APPH-4-OCC-V1.0",
        "IRTF-SR-URAC-4-OCC-V1.0",
        "PAL200-SR-CIRC-4-OCC-V1.0",
        "MCD27M-SR-IIRAR-4-OCC-V1.0",
    )
    if (
        dataset_id in occultation_dataset_ids
        and "GEOM" in identifiers["PRODUCT_ID"]
        and name == "SERIES"
    ):
        return True, formats.ground.ebrocc_geom_get_position(*shared_args)

    if dataset_id == "MRO-M-CRISM-5-RDR-MULTISPECTRAL-V1.0":
        if "MRRWV" in identifiers["PRODUCT_ID"] and name == "TABLE":
            return True, formats.mro.crism_mrdr_ancill_position(*shared_args)

    if dataset_id == "MSX-L-SPIRIT3-2/4-V1.0" and name == "ENVI_HEADER":
        # this helper takes `fn` as an extra trailing argument
        return True, formats.msx.cube_envi_header_position(*shared_args, fn)

    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "VENUS EXPRESS"
        and instrument_id == "VRA"
        and identifiers["PRODUCT_TYPE"] == "UDR"
    ):
        # helper result is returned as-is, not wrapped in (True, ...)
        return formats.vex_vera.udr_table_special_position()
    return False, None

check_special_qube_band_storage(identifiers: DataIdentifiers)

Defines band storage types for QUBE products whose labels do not correctly specify them. Wraps get_qube_band_storage_type().

Source code in pdr/formats/checkers.py
1216
1217
1218
1219
1220
1221
1222
1223
def check_special_qube_band_storage(identifiers: DataIdentifiers):
    """
    Supply band storage types for QUBE products whose labels do not specify
    them correctly. Wraps `get_qube_band_storage_type()`.

    Returns `(False, None)` unless the product's INSTRUMENT_HOST_NAME is
    exactly "CASSINI_ORBITER" (with an underscore), in which case the result
    of the Cassini-specific helper is returned as-is.
    """
    if identifiers["INSTRUMENT_HOST_NAME"] != "CASSINI_ORBITER":
        return False, None
    return formats.cassini.get_special_qube_band_storage()

check_special_sample_type(identifiers: DataIdentifiers, base_samp_info: dict) -> tuple[bool, Optional[str]]

Preempt generic mapping of PDS3 data types to numpy dtype strings. Wraps image_sample_type(); called inline by insert_sample_types_into_df().

Source code in pdr/formats/checkers.py
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
def check_special_sample_type(
    identifiers: DataIdentifiers,
    base_samp_info: dict,
) -> tuple[bool, Optional[str]]:
    """
    Look for a product-specific rule that replaces the generic mapping of
    PDS3 data types to numpy dtype strings. Wraps `image_sample_type()`;
    called inline by `insert_sample_types_into_df()`.

    Returns `(False, None)` when no rule matches. The first three rules
    return `(True, <helper result>)`; the remaining rules return their
    helper's result unwrapped.
    """
    dataset_id = identifiers["DATA_SET_ID"]

    if dataset_id == "JNO-J-JIRAM-3-RDR-V1.0":
        if identifiers.get("PRODUCT_TYPE", "") == "RDR":
            return True, formats.juno.jiram_rdr_sample_type()

    if identifiers["INSTRUMENT_ID"] == "LROC":
        if identifiers["PRODUCT_TYPE"] == "EDR":
            # unsigned integers not specified as such
            return True, formats.lroc.lroc_edr_sample_type()

    if dataset_id == "MGN-V-RDRS-5-GVDR-V1.0":
        if (
            "GVANF" in identifiers["PRODUCT_ID"]
            and "N/A" in base_samp_info["SAMPLE_TYPE"]
        ):
            return True, formats.mgn.gvanf_sample_type()

    if dataset_id == "LRO-L-CRAT-2-EDR-RAWDATA-V1.0":
        return formats.lro.crater_bit_col_sample_type(base_samp_info)

    if identifiers["SPACECRAFT_NAME"] == "GALILEO_ORBITER":
        if "-NIMS-2-EDR-V1.0" in dataset_id:
            return formats.galileo.nims_edr_sample_type(base_samp_info)

    if dataset_id == "ULY-J-EPAC-4-SUMM-PHA-24HR-V1.0":
        if identifiers["PRODUCT_ID"].endswith("BIN"):
            return formats.ulysses.get_sample_type(base_samp_info)

    cassini_iss_match = re.match(
        r"CO-(CAL-ISS|[S/EVJ-]+ISSNA/ISSWA-2)", dataset_id
    )
    if cassini_iss_match:
        return formats.cassini.line_prefix_sample_type(base_samp_info)
    return False, None

check_special_structure(name: str, block: MultiDict, fn: str, data: PDRLike, identifiers: DataIdentifiers) -> tuple[bool, Optional[tuple[pd.DataFrame, Optional[np.dtype]]]]

Preempt generic ARRAY/TABLE/SPREADSHEET format definition parsing. Wraps parse_array_structure() and parse_table_structure().

Source code in pdr/formats/checkers.py
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
def check_special_structure(
    name: str,
    block: MultiDict,
    fn: str,
    data: PDRLike,
    identifiers: DataIdentifiers
) -> tuple[bool, Optional[tuple[pd.DataFrame, Optional[np.dtype]]]]:
    """
    Preempt generic ARRAY/TABLE/SPREADSHEET format definition parsing. Wraps
    `parse_array_structure()` and `parse_table_structure()`.

    The checks below are tried in order and the first match wins. Most
    branches return `(True, <helper result>)`. The Galileo SSI, Odyssey GRS
    and Venus Express VRA branches return their helper's result unwrapped.
    `(False, None)` is returned when nothing matches, and also for
    Rosetta CONSERT tables from a "GRNDBENCH" data set.
    """
    if (
        identifiers["DATA_SET_ID"] == "CLEM1-L-RSS-5-BSR-V1.0"
        and name == "DATA_TABLE"
    ):
        # sequence wrapped as string for object names
        return True, formats.clementine.get_structure(
            block, name, fn, data, identifiers
        )
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "MARS GLOBAL SURVEYOR"
        and identifiers["INSTRUMENT_ID"] == "RSS"
        and identifiers["PRODUCT_TYPE"] == "ODF"
        and name == "ODF3B_TABLE"
    ):
        return True, formats.mgs.get_odf_structure(
            block, name, fn, data, identifiers
        )

    # unlike the other branches, this one reads `identifiers` with .get(),
    # and it does not restrict the object name
    if (
        identifiers.get("INSTRUMENT_HOST_NAME") == "MARS GLOBAL SURVEYOR"
        and identifiers.get("INSTRUMENT_NAME") == "RADIO SCIENCE SUBSYSTEM"
        and identifiers.get("PRODUCT_TYPE") == "ECS"
    ):
        return True, formats.mgs.get_ecs_structure(
            block, name, fn, data, identifiers
        )
    # TODO: yikes
    # Two alternatives share one helper: Cassini RPWS TIME_SERIES objects, or
    # Huygens products that either have "HUY_DTWG_ENTRY_AERO" in `fn` or
    # have "HASI" but not "PWA" in their FILE_NAME. Note the "HASI" test
    # reads FILE_NAME via data.metaget_() while the "PWA" test reads it from
    # `identifiers`.
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "CASSINI ORBITER"
        and identifiers["INSTRUMENT_ID"] == "RPWS"
        and name == "TIME_SERIES"
    ) or (
        identifiers["INSTRUMENT_HOST_NAME"] == "HUYGENS PROBE"
        and (
            "HUY_DTWG_ENTRY_AERO" in fn
            or (
                "HASI" in data.metaget_("FILE_NAME", "")
                and "PWA" not in identifiers["FILE_NAME"]
            )
        )
    ):
        return True, formats.cassini.get_structure(
            block, name, fn, data, identifiers
        )
    if (
        re.match(
            r"GP-J-(NMS|ASI)-3-ENTRY-V1.0", identifiers["DATA_SET_ID"]
        )
        and name == "TABLE"
    ):
        return True, formats.galileo.probe_structure(
            block, name, fn, data, identifiers
        )
    if (
        identifiers["DATA_SET_ID"] == "GO-E-EPD-2-SAMP-PAD-V1.0"
        and identifiers["PRODUCT_ID"] == "E1PAD_7.TAB"
        and name == "TIME_SERIES"
    ):
        return True, formats.galileo.epd_structure(
            block, name, fn, data, identifiers
        )
    if (
        "VEGA" in identifiers["DATA_SET_ID"]
        and "-C-DUCMA-3-RDR-HALLEY-V1.0" in identifiers["DATA_SET_ID"]
        and name == "TABLE"
    ):
        return True, formats.vega.get_structure(
            block, name, fn, data, identifiers
        )
    # note: this helper is called with `name` ahead of `block`, unlike the
    # other helpers in this function
    if (
        "GIO-C-PIA-3-RDR-HALLEY-V1.0" == identifiers["DATA_SET_ID"]
        or re.match(r"VEGA.-C-PUMA.*", identifiers["DATA_SET_ID"])
    ) and name == "ARRAY":
        return True, formats.vega.fix_array_structure(
            name, block, fn, data, identifiers
        )
    if (
        re.match(r"MRO-M-MCS-(4-RDR|2-EDR)-V1.0", identifiers["DATA_SET_ID"])
        and name == "TABLE"
    ):
        return True, formats.mro.get_structure(
            block, name, fn, data, identifiers
        )
    # the branches below that index block["^STRUCTURE"] only do so after
    # the data set ID and object name have already matched
    if (
        identifiers["DATA_SET_ID"] == "VG2-SS-PLS-4-SUMM-1HR-AVG-V1.0"
        and name == "TABLE"
        and block["^STRUCTURE"] == "VGR_PLS_HR_2017.FMT"
    ):
        return True, formats.voyager.get_structure(
            block, name, fn, data, identifiers
        )
    if "IHW-C-SPEC-" in identifiers["DATA_SET_ID"] and name == "SPECTRUM":
        return True, formats.ihw.get_structure(
            block, name, fn, data, identifiers
        )
    if (
        identifiers["DATA_SET_ID"] == "PHX-M-MECA-2-NIEDR-V1.0"
        and name == "TBL_TABLE"
        and block["CONTAINER"]["^STRUCTURE"] == "TBL_0_STATE_DATA.FMT"
    ):
        return True, formats.phoenix.elec_em6_structure(
            block, name, fn, data, identifiers
        )
    if (
        identifiers["DATA_SET_ID"] == "PHX-M-MECA-4-NIRDR-V1.0"
        and identifiers["INSTRUMENT_ID"] == "MECA_AFM"
        and "HEADER_TABLE" in name
    ):
        return True, formats.phoenix.afm_rdr_structure(
            block, name, fn, data, identifiers
        )
    if (
        identifiers["DATA_SET_ID"] == "PHX-M-TEGA-2-LEDEDR-V1.0"
        and name == "TIME_SERIES"
        and block["^STRUCTURE"] == "TEGA_LED.FMT"
    ):
        return True, formats.phoenix.led_edr_structure(
            block, name, fn, data, identifiers
        )
    if (
        identifiers["DATA_SET_ID"] == "PHX-M-TEGA-4-SCRDR-V1.0"
        and name == "TIME_SERIES"
        and block["^STRUCTURE"] == "TEGA_SCRDR.FMT"
    ):
        return True, formats.phoenix.sc_rdr_structure(
            block, name, fn, data, identifiers
        )
    if (
        identifiers["DATA_SET_ID"] == "MEX-SUN-ASPERA3-4-SWM-V1.0"
        and name == "TABLE"
    ):
        return True, formats.mex.aspera_ima_ddr_structure(
            block, name, fn, data, identifiers
        )
    if (
        "-MIDAS-3-" in identifiers["DATA_SET_ID"]
        and "SPS" in identifiers["PRODUCT_ID"]
        and name == "TIME_SERIES"
    ):
        return True, formats.rosetta.midas_rdr_sps_structure(
            block, name, fn, data, identifiers
        )
    if (
        "-MIDAS-3-" in identifiers["DATA_SET_ID"]
        and "FSC" in identifiers["PRODUCT_ID"]
        and name == "FREQUENCY_SERIES"
    ):
        return True, formats.rosetta.fix_pad_length_structure(
            block, name, fn, data, identifiers
        )
    if (
        "ROSETTA" in identifiers["DATA_SET_NAME"]
        and "CONSERT" in identifiers["DATA_SET_NAME"]
        and "TABLE" in name
        and name.startswith(("I_", "Q_"))
    ):
        # "GRNDBENCH" data sets are explicitly excluded; returning here
        # also means none of the later checks run for them
        if "GRNDBENCH" in identifiers["DATA_SET_ID"]:
            return False, None
        return True, formats.rosetta.fix_pad_length_structure(
            block, name, fn, data, identifiers
        )
    # from here on, helper results are returned as-is rather than being
    # wrapped in (True, ...)
    if (
        identifiers["SPACECRAFT_NAME"] == "GALILEO ORBITER"
        and identifiers["INSTRUMENT_NAME"] in ("SOLID_STATE_IMAGING",
                                               "SOLID STATE IMAGING SYSTEM")
    ):
        return formats.galileo.ssi_redr_structure(
            block, name, fn, data, identifiers
        )
    if (
        identifiers["DATA_SET_ID"] == "ODY-M-GRS-2-EDR-V1.0"
        and identifiers["PRODUCT_TYPE"] == "E_KERNEL"
        and ".txt" in fn.lower()
    ):
        return formats.odyssey.grs_e_kernel_structure()
    # applies only when `fn` contains none of "ODF", "TNF" or "RSR"
    if (
            identifiers["INSTRUMENT_HOST_NAME"] == "VENUS EXPRESS"
            and identifiers["INSTRUMENT_ID"] in ["VRA"]
            and identifiers['PRODUCT_TYPE'] == "UDR"
            and "ODF" not in fn
            and "TNF" not in fn
            and "RSR" not in fn
    ):
        return formats.vex_vera.udr_table_structure()

    return False, None

check_special_table_reader(identifiers: DataIdentifiers, name: str, fn: str, fmtdef_dt: tuple[pd.DataFrame, np.dtype], block: MultiDict, start_byte: int)

Preempt loaders.datawrap.ReadTable's dispatch to read_table().

Source code in pdr/formats/checkers.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
def check_special_table_reader(
    identifiers: DataIdentifiers,
    name: str,
    fn: str,
    fmtdef_dt: tuple[pd.DataFrame, np.dtype],
    block: MultiDict,
    start_byte: int,
):
    """
    Preempt loaders.datawrap.ReadTable's dispatch to `read_table()`.

    This is an ordered chain of product-specific checks. Each check inspects
    some combination of `identifiers`, `name`, `fn`, and `block`; the first
    one that matches calls its format-specific loader immediately and
    returns. Because the first match wins, the order of the checks matters.

    Args:
        identifiers: label identifier values, looked up by keys such as
            "DATA_SET_ID", "INSTRUMENT_ID", "PRODUCT_TYPE", and "PRODUCT_ID".
        name: name of the table-like object being loaded (compared against
            values such as "TABLE" and "SPREADSHEET").
        fn: filename of the data file; passed to most loaders and also
            matched against directly in a few checks.
        fmtdef_dt: (format definition, dtype) pair; not inspected here, only
            passed through to the loaders that take it.
        block: label block for the object; read here only for "^STRUCTURE"
            and "INTERCHANGE_FORMAT", otherwise passed through.
        start_byte: not inspected here; passed through to the loaders that
            take it.

    Returns:
        `(True, <loader result>)` if a special case applies, otherwise
        `(False, None)`. See the note on the ODY GRS E_KERNEL check for the
        one branch that does not build this tuple itself.
    """
    if identifiers["DATA_SET_ID"] in (
        "CO-S-MIMI-4-CHEMS-CALIB-V1.0",
        "CO-S-MIMI-4-LEMMS-CALIB-V1.0",
        "CO-S-MIMI-4-INCA-CALIB-V1.0",
        "CO-E/J/S/SW-MIMI-2-LEMMS-UNCALIB-V1.0",
        "CO-SSA-RADAR-3-ABDR-SUMMARY-V1.0",
    ):
        return True, formats.cassini.spreadsheet_loader(
            fn, fmtdef_dt, identifiers["DATA_SET_ID"]
        )
    # same loader as above, selected by host / product type instead of by an
    # exact DATA_SET_ID
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "CASSINI ORBITER"
        and identifiers["PRODUCT_TYPE"] == "ANCILLARY"
        and "CO-S-MIMI-4-" in identifiers["DATA_SET_ID"]
    ):
        return True, formats.cassini.spreadsheet_loader(
            fn, fmtdef_dt, identifiers["DATA_SET_ID"]
        )
    if identifiers["INSTRUMENT_ID"] == "CHEMIN" and ("SPREADSHEET" in name):
        # mangled object names + positions
        return True, formats.msl_cmn.spreadsheet_loader(fn)
    if (
        "MSL-M-SAM-" in identifiers["DATA_SET_ID"]
        and "QMS" in identifiers["PRODUCT_ID"]
        and "TABLE" in name
    ):
        # reusing the msl_cmn special case for msl_sam qms tables
        return True, formats.msl_cmn.spreadsheet_loader(fn)
    if (
        identifiers["DATA_SET_ID"] == "MSL-M-ROVER-6-RDR-PLACES-V1.0"
        and name == "SPREADSHEET"
    ):
        return True, formats.msl_places.spreadsheet_loader(fn, fmtdef_dt)
    if (
        identifiers["INSTRUMENT_NAME"]
        == "ROSETTA PLASMA CONSORTIUM - MUTUAL IMPEDANCE "
        "PROBE"
        and "SPECTRUM_TABLE" in name
    ):
        return True, formats.rosetta.rosetta_table_loader(fn, fmtdef_dt)
    # two unrelated product families share the Magellan geometry-table loader
    if (
        identifiers["SPACECRAFT_NAME"] == "MAGELLAN"
        and name == "TABLE"
        and identifiers["NOTE"].startswith("Geometry")
    ) or (
        identifiers["DATA_SET_ID"] == "GO-J-NIMS-4-ADR-SL9IMPACT-V1.0"
        and name == "TABLE"
        and (
            "CAL_DATA.TAB" in identifiers["PRODUCT_ID"]
            or "G_DATA.TAB" in identifiers["PRODUCT_ID"]
            or "R_DATA.TAB" in identifiers["PRODUCT_ID"]
        )
    ):
        return True, formats.mgn.geom_table_loader(fn, fmtdef_dt)
    if (
        identifiers["DATA_SET_ID"].startswith("MGN-V-RSS-5-OCC-PROF")
        and name == "TABLE"
    ):
        return True, formats.mgn.occultation_loader(
            identifiers, fmtdef_dt, block, fn
        )
    if (
        identifiers["INSTRUMENT_ID"] == "DLRE"
        and identifiers["PRODUCT_TYPE"] in ("GCP", "PCP", "PRP")
        and name == "TABLE"
    ):
        return True, formats.diviner.diviner_l4_table_loader(fmtdef_dt, fn)
    if (
        identifiers["DATA_SET_ID"] == "GO-J-PWS-5-DDR-PLASMA-DENSITY-FULL-V1.0"
        and name == "SPREADSHEET"
    ):
        return True, formats.galileo.pws_table_loader(fn, fmtdef_dt)
    if (
        identifiers["DATA_SET_ID"] == "ODY-M-GRS-5-ELEMENTS-V1.0"
        and name == "TABLE"
    ):
        return True, formats.odyssey.map_table_loader(fn, fmtdef_dt)
    if (
        identifiers["DATA_SET_ID"] == "ULY-J-GAS-5-SKY-MAPS-V1.0"
        and name == "TABLE"
        and block["^STRUCTURE"] == "GASDATA.FMT"
    ):
        return True, formats.ulysses.gas_table_loader(fn, fmtdef_dt)
    # every MCS DDR data set version except V1.0 (V1.0 is handled as a trivial
    # case in check_trivial_case)
    if (
        "MRO-M-MCS-5-DDR" in identifiers["DATA_SET_ID"]
        and "V1.0" not in identifiers["DATA_SET_ID"]
        and name == "TABLE"
    ):
        return True, formats.mro.mcs_ddr_table_loader(
            block, fn, start_byte
        )
    if (
        identifiers["INSTRUMENT_ID"] == "CRISM"
        and identifiers["PRODUCT_TYPE"] == "ANCILLARY"
        and "OBS" in identifiers["PRODUCT_ID"]
        and name == "TABLE"
    ):
        return True, formats.mro.ancil_table_loader(fn, fmtdef_dt)
    if (
        identifiers["DATA_SET_ID"] == "IHW-C-IRFCURV-3-EDR-HALLEY-V2.0"
        and name == "TABLE"
    ):
        return True, formats.ihw.curve_table_loader(fn, fmtdef_dt)
    if (
        identifiers["DATA_SET_ID"]
        in (
            "IHW-C-PPFLX-3-RDR-HALLEY-V1.0",
            "IHW-C-PPOL-3-RDR-HALLEY-V1.0",
            "IHW-C-PPSTOKE-3-RDR-HALLEY-V1.0",
            "IHW-C-PPMAG-3-RDR-HALLEY-V1.0",
            "IHW-C-MSNRDR-3-RDR-HALLEY-ETA-AQUAR-V1.0",
            "IHW-C-MSNRDR-3-RDR-HALLEY-ORIONID-V1.0",
            "IHW-C-MSNVIS-3-RDR-HALLEY-ETA-AQUAR-V1.0",
            "IHW-C-MSNVIS-3-RDR-HALLEY-ORIONID-V1.0",
            "IHW-C-IRFTAB-3-RDR-HALLEY-V1.0",
            "IHW-C-IRPOL-3-RDR-HALLEY-V1.0",
            "IHW-C-IRPHOT-3-RDR-HALLEY-V1.0",
        )
        and name == "TABLE"
    ):
        return True, formats.ihw.add_newlines_table_loader(
            fmtdef_dt, block, fn, start_byte
        )
    if (
        identifiers["DATA_SET_ID"] == "VG1-J-LECP-4-SUMM-SECTOR-15MIN-V1.1"
        and name == "TABLE"
    ):
        return True, formats.voyager.lecp_table_loader(fn, fmtdef_dt)
    if (
        identifiers["DATA_SET_ID"] == "VG2-U-LECP-4-RDR-STEP-12.8MIN-V1.0"
        and block["INTERCHANGE_FORMAT"] == "ASCII"
        and name == "TABLE"
    ):
        return True, formats.voyager.lecp_table_loader(fn, fmtdef_dt)
    if (
        identifiers["DATA_SET_ID"] == "VG1-S-LECP-3-RDR-STEP-6MIN-V1.0"
        and name == "SPREADSHEET"
    ):
        return True, formats.voyager.lecp_vg1_sat_table_loader(fn, fmtdef_dt)
    if identifiers["DATA_SET_ID"] == "VL2-M-SEIS-5-RDR-V1.0" and name in (
        "TABLE",
        "SPREADSHEET",
    ):
        return True, formats.viking.seis_table_loader(fn, fmtdef_dt)
    if (
        "MEX-M-ASPERA3-2-EDR-IMA" in identifiers["DATA_SET_ID"]
        and name == "SPREADSHEET"
    ):
        return True, formats.mex.aspera_table_loader(fn, fmtdef_dt)
    if (
        re.match(r"MER[12]-M-RSS-1-EDR-V1.0", identifiers["DATA_SET_ID"])
        and identifiers["PRODUCT_TYPE"] == "UHFD"
        and name == "SPREADSHEET"
    ):
        return True, formats.mer.rss_spreadsheet_loader(fn, fmtdef_dt)
    if (
        identifiers["DATA_SET_ID"] == "PHX-M-MECA-4-NIRDR-V1.0"
        and identifiers["INSTRUMENT_ID"] == "MECA_AFM"
        and "TABLE" in name
    ):
        return True, formats.phoenix.afm_table_loader(fn, fmtdef_dt, name)
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "MARS EXPRESS"
        and identifiers["INSTRUMENT_ID"] == "MRS"
        and all(x in identifiers["PRODUCT_ID"] for x in ["ODF", "L1B"])
        and "TABLE" in name
    ):
        return True, formats.mex.mrs_l1b_odf_table_loader(fn, fmtdef_dt)
    if (
        identifiers["DATA_SET_ID"] == "LRO-L-MRFLRO-1-PDR-V1.0"
        and "HK_" in identifiers["PRODUCT_ID"]
        and name == "SPREADSHEET"
    ):
        return True, formats.lro.mini_rf_spreadsheet_loader(fn, fmtdef_dt)
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "MARS SCIENCE LABORATORY"
        and identifiers["INSTRUMENT_ID"] == "REMS"
        and "TABLE" in name
        and "SP_____" in identifiers["PRODUCT_ID"]
    ):
        return True, formats.msl_rems.edr_table_loader(
            fn, fmtdef_dt, block, start_byte
        )
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "MARS SCIENCE LABORATORY"
        and identifiers["INSTRUMENT_ID"] == "REMS"
        and name == "REMS_SCIENCE_TABLE"
        and "RDR" in identifiers["DATA_SET_ID"]
    ):
        return True, formats.msl_rems.rdr_table_loader(fn, fmtdef_dt)
    # reuses the IHW add-newlines loader for ICE trajectory tables
    if (
        all(x in identifiers["DATA_SET_ID"] for x in ["ICE-C-", "-3-RDR-"])
        and "TRAJ_ICE" in fn
        and name == "TABLE"
    ):
        return True, formats.ihw.add_newlines_table_loader(
            fmtdef_dt, block, fn, start_byte
        )
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "LUNAR RECONNAISSANCE ORBITER"
        and identifiers["INSTRUMENT_NAME"] == "RADIO SCIENCE SUBSYSTEM"
        and name == "WEAREC_TABLE"
    ):
        return True, formats.lro.wea_table_loader(fn, fmtdef_dt)
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "LUNAR PROSPECTOR"
        and identifiers["PRODUCT_ID"] == "OUTAGES"
        and name == "TABLE"
    ):
        return True, formats.lp.ancillary_table_loader(fn, fmtdef_dt)
    # NOTE(review): unlike every other branch, this one returns the loader's
    # value directly instead of wrapping it in `(True, ...)`. Presumably
    # grs_e_kernel_loader builds the (bool, value) pair itself -- confirm.
    if (
        identifiers["DATA_SET_ID"] == "ODY-M-GRS-2-EDR-V1.0"
        and identifiers["PRODUCT_TYPE"] == "E_KERNEL"
        and ".txt" in fn.lower()
    ):
        return formats.odyssey.grs_e_kernel_loader(name, fn)
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "VENUS EXPRESS"
        and identifiers["INSTRUMENT_ID"] in ["VRA"]
        and identifiers['PRODUCT_TYPE'] == "UDR"
        and ".RAW" in identifiers["PRODUCT_ID"]

    ):
        return True, formats.vex_vera.udr_table_loader(fn)
    # this doesn't exist yet
    # if (
    #     identifiers["INSTRUMENT_ID"] == "MOLA"
    #     and ".B" in identifiers["FILE_NAME"]
    #     and "AEDR" in identifiers["DATA_SET_ID"]
    # ):
    #     return True, formats.mgs.aedr_table_loader(fn, name, fmtdef_dt, block,
    #                                                start_byte,
    #                                                identifiers['RECORD_BYTES'])
    if (
        identifiers["INSTRUMENT_ID"] == "SPEDE"
        and "PD_40_" in identifiers["FILE_NAME"]
        and ".TAB" in identifiers["FILE_NAME"]
    ):
        return True, formats.smart1_esa.spede_plasma40_table_reader(fn)

    # no special case applies; the caller proceeds with its generic reader
    return False, None

check_trivial_case(pointer: str, identifiers: DataIdentifiers, fn: str) -> bool

Supplement generic definition of 'trivial' pointers. Intended primarily to preempt attempts to load known-unsupported data objects associated with otherwise-supported products. Called inline by pointer_to_loader().

Source code in pdr/formats/checkers.py
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
def check_trivial_case(pointer: str, identifiers: DataIdentifiers, fn: str) -> bool:
    """
    Supplement generic definition of 'trivial' pointers. Intended primarily to
    preempt attempts to load known-unsupported data objects associated with
    otherwise-supported products. Called inline by `pointer_to_loader()`.

    The generic `is_trivial()` test runs first. After that, an ordered chain
    of product-specific checks is evaluated against `pointer`, `identifiers`,
    and (for a few cases) `fn`; the first match returns the value of its
    format-specific helper.

    Args:
        pointer: name of the data object / pointer under consideration.
        identifiers: label identifier values, looked up by keys such as
            "DATA_SET_ID", "INSTRUMENT_ID", and "PRODUCT_ID".
        fn: filename of the product; matched against directly in the
            Magellan and Chang'e VNIS checks.

    Returns:
        True if `is_trivial(pointer)` is True; otherwise the return value of
        the first matching format-specific helper (presumably always a
        truthy bool -- confirm against the helpers); otherwise False.
    """
    if is_trivial(pointer):
        return True
    if (
        identifiers["INSTRUMENT_ID"] == "APXS"
        and "ERROR_CONTROL_TABLE" in pointer
    ):
        return formats.msl_apxs.table_loader(pointer)
    if (
        identifiers["INSTRUMENT_NAME"] == "TRIAXIAL FLUXGATE MAGNETOMETER"
        and pointer == "TABLE"
        and "-EDR-" in identifiers["DATA_SET_ID"]
    ):
        return formats.galileo.galileo_table_loader()
    if (
        identifiers["INSTRUMENT_NAME"]
        == "CHEMISTRY CAMERA REMOTE MICRO-IMAGER"
        and pointer == "IMAGE_REPLY_TABLE"
    ):
        return formats.msl_ccam.image_reply_table_loader()
    if identifiers["DATA_SET_ID"].startswith("ODY-M-THM-5") and (
        pointer in ("HEADER", "HISTORY")
    ):
        return formats.themis.trivial_themis_geo_loader(pointer)
    # if re.match(
    #     r"CO-(CAL-ISS|[S/EVJ-]+ISSNA/ISSWA-2)", identifiers["DATA_SET_ID"]
    # ):
    #     if pointer == "LINE_PREFIX_TABLE":
    #         return formats.cassini.trivial_loader(pointer)
    # only the Cassini ISS calibration products with exactly 1025 file records
    if (
        identifiers["DATA_SET_ID"] == "CO-CAL-ISS-2-V1.0"
        and pointer in ("TELEMETRY_TABLE",
                        "LINE_PREFIX_TABLE")
        and identifiers["FILE_RECORDS"] == 1025
    ):
        return formats.cassini.iss_cal_trivial_loader(pointer)
    if (
        identifiers["SPACECRAFT_NAME"] == "MAGELLAN"
        and (fn.endswith(".img") or fn.endswith(".ibg"))
        and pointer == "TABLE"
    ):
        return formats.mgn.orbit_table_in_img_loader()
    if (
        "GO-A-SSI-3-" in identifiers["DATA_SET_ID"]
        and "-CALIMAGES-V1.0" in identifiers["DATA_SET_ID"]
        and "QUB" in identifiers["PRODUCT_ID"]
        and pointer == "HEADER"
    ):
        return formats.galileo.ssi_cubes_header_loader()
    if identifiers["INSTRUMENT_ID"] == "CHEMIN" and (pointer == "HEADER"):
        return formats.msl_cmn.trivial_header_loader()
    if "MSL-M-SAM-" in identifiers["DATA_SET_ID"] and "FILE" in pointer:
        # reusing the msl_cmn special case for msl_sam 'FILE' pointers
        return formats.msl_cmn.trivial_header_loader()
    if (
        identifiers["INSTRUMENT_ID"] == "NIMS"
        and identifiers["SPACECRAFT_NAME"] == 'GALILEO_ORBITER'
        and pointer == "SAMPLE_SPECTRUM_QUBE"
    ):
        return formats.galileo.nims_sample_spectral_qube_trivial_loader()
    if (
        identifiers["DATA_SET_ID"] == "BUGLAB-L-BUG-4-APOLLO-SAMPLES-V1.0"
        and pointer == "HEADER"
    ):
        return formats.ground.trivial_header_loader()
    if (
        identifiers["DATA_SET_ID"] == "MSL-M-APXS-4/5-RDR-V1.0"
        and pointer == "HEADER"
    ):
        return formats.msl_apxs.trivial_header_loader()
    # note: this check does not look at `pointer`, so it applies to every
    # pointer of a matching product
    if (
            identifiers["INSTRUMENT_HOST_NAME"] == "VENUS EXPRESS"
            and identifiers["INSTRUMENT_ID"] in ["VRA"]
            and "1A_TNF" in identifiers["PRODUCT_ID"]
    ):
        return formats.vex_vera.trvial_dsn_table()
    if (
            identifiers["INSTRUMENT_HOST_NAME"] == "VENUS EXPRESS"
            and identifiers["INSTRUMENT_ID"] in ["VIRTIS"]
            and pointer == "HISTORY"
    ):
        return formats.vex_virtis.trivial_history()
    # note: this check does not look at `pointer` either; it covers only the
    # V1.0 MCS DDR data sets
    if (
        "MRO-M-MCS-5-DDR" in identifiers["DATA_SET_ID"]
        and "V1.0" in identifiers["DATA_SET_ID"]
    ):
        return formats.mro.mcs_ddr_oldformat_trivial()
    if (
        identifiers["DATA_SET_ID"] == 'SLN-L-PACE-3-PBF1-V3.0'
        and pointer == 'TIME_SERIES'
    ):
        return formats.kaguya.pace_time_series_trivial()
    if (
            "SLN-L-SP" in identifiers['DATA_SET_ID']
            and "L2D_RESULT_ARRAY" in pointer
    ):
        return formats.kaguya.sp_l2d_result_array_trivial()
    if (
        identifiers['DATA_SET_ID'] == "SLN-L-TC-4-DEM-ORTHO-V1.0"
        and pointer == "QA_FILENAME"
    ):
        return formats.kaguya.sp_tc_filename_pointer_trivial()
    if (
        identifiers['DATA_SET_ID'] == "SLN-L-GRS-3-ENG-SPECTRUM-V1.0"
        and pointer == "TABLE"
    ):
        return formats.kaguya.grs_eng_tables_trivial()
    if (
        "CE3_BMYK_VNIS-CC_SCI_" in fn
        and pointer == "CAL_TARGET_DATA"
    ):
        return formats.change.cal_target_data_trivial()
    if (
        identifiers['SPACECRAFT_NAME'] == 'CE1'
        and "IMAGE_PREFIX" in pointer
    ):
        return formats.change.image_prefix_trivial()
    # nothing matched: the pointer is not trivial and should be loaded normally
    return False

is_trivial(pointer: str) -> bool

Returns True if this is the name of a data object we want to handle trivially, in the sense that we never ever want to load it directly.

Source code in pdr/loaders/utility.py
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def is_trivial(pointer: str) -> bool:
    """
    Decide whether `pointer` names a data object that should be handled
    trivially, i.e. one we never want to load directly.

    A pointer is trivial when any of the following holds:

    * it mentions "TIFF" but neither "IMAGE" nor "DOCUMENT" (TIFF tags and
      headers should always be parsed by the TIFF parser itself);
    * it mentions "STRUCTURE" (STRUCTUREs are not presented separately from
      their tables);
    * it mentions "PDS_OBJECT" (only found in MSL CCAM products; probably
      meant for internal processing pipelines).
    """
    tiff_metadata_only = (
        "TIFF" in pointer
        and "IMAGE" not in pointer
        and "DOCUMENT" not in pointer
    )
    return (
        tiff_metadata_only
        or "STRUCTURE" in pointer
        or "PDS_OBJECT" in pointer
    )

special_image_constants(identifiers: DataIdentifiers) -> dict[str, int]

Defines 'secret' special constants for a dataset or product type. Called inline by Data.find_special_constants().

Source code in pdr/formats/checkers.py
1147
1148
1149
1150
1151
1152
1153
1154
1155
def special_image_constants(identifiers: DataIdentifiers) -> dict[str, int]:
    """
    Return 'secret' special constants for a dataset or product type, i.e.
    constants hard-coded here rather than read from the product. Called
    inline by `Data.find_special_constants()`.

    Currently only CRISM products (INSTRUMENT_ID == "CRISM") get one: a
    "NULL" value of 65535. Every other product gets an empty dict.
    """
    if identifiers["INSTRUMENT_ID"] == "CRISM":
        return {"NULL": 65535}
    return {}

specialblock(data: PDRLike, name: str)

Special-purpose wrapper for check_special_block() intended for use outside of the query workflow.

Source code in pdr/formats/checkers.py
804
805
806
807
808
809
810
811
812
def specialblock(data: PDRLike, name: str):
    """
    Convenience wrapper around check_special_block() for callers outside of
    the query workflow.

    Returns the special-case block for `name` when check_special_block()
    reports one, and otherwise falls back to `data.metablock_(name)`.
    """
    special, special_block = check_special_block(name, data, data.identifiers)
    # strict identity check: only a literal True selects the special block
    return special_block if special is True else data.metablock_(name)

formats.cassini

cda_table_filename(data)

HITS: * cassini_cda * cda_area * cda_stat * cda_events * cda_spectra * cda_settings * cda_counter * cda_signals

Source code in pdr/formats/cassini.py
190
191
192
193
194
195
196
197
198
199
200
201
202
203
def cda_table_filename(data):
    """
    Build the table filename for a product from its label filename: same
    directory and stem as `data.labelname`, with a ".TAB" extension.

    Returns `(True, Path)`.

    HITS:
    * cassini_cda
        * cda_area
        * cda_stat
        * cda_events
        * cda_spectra
        * cda_settings
        * cda_counter
        * cda_signals
    """
    label_path = Path(data.labelname)
    table_path = label_path.parent / f"{label_path.stem}.TAB"
    return True, table_path

coiss_1006_offset(data, name, identifiers)

Start bytes (given in units of RECORD_BYTES) are off by 1 for products from volume coiss_1006 ("Range (SCLK): 1359362956 - 1363539029"). Easy to validate: if the TELEMETRY_TABLE's NULL_PADDING column is not 0, then start_byte is off for all of that product's pointers except IMAGE_HEADER.

HITS: * cassini_iss * calib_evj (partial)

Source code in pdr/formats/cassini.py
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
def coiss_1006_offset(data, name, identifiers):
    """
    Correct start bytes for products from volume coiss_1006
    ("Range (SCLK): 1359362956 - 1363539029"), whose record-based start
    positions are off by 1.

    Easy to validate: if the TELEMETRY_TABLE's NULL_PADDING column is not 0,
    then start_byte is off for all of that product's pointers except
    IMAGE_HEADER.

    Returns `(False, None)` for IMAGE_HEADER, which needs no correction.
    For any other pointer, returns `(True, start_byte)` where start_byte is
    RECORD_BYTES times (the pointer's record number minus 2).

    HITS:
    * cassini_iss
        * calib_evj (partial)
    """
    if name != "IMAGE_HEADER":
        record_bytes = identifiers["RECORD_BYTES"]
        # second element of the pointer's label value is the record number
        record_number = data.metaget(f"^{name}")[1]
        return True, record_bytes * (record_number - 2)
    return False, None

get_offset(filename, identifiers)

HITS: * cassini_hp * ddr * img_table * strip * solar * sun * time * vis

Source code in pdr/formats/cassini.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def get_offset(filename, identifiers):
    """
    Compute a table's start byte by counting ROWS rows back from the end of
    the file.

    The row width is ROW_BYTES for ULVS_DDP, DLIS_AZ_DDP and DLV_DDP files,
    and ROW_BYTES + 1 for everything else (presumably one extra byte per row
    that the label does not count -- confirm).

    Returns `(True, start_byte)`.

    HITS:
    * cassini_hp
        * ddr
        * img_table
        * strip
        * solar
        * sun
        * time
        * vis
    """
    label_width_is_exact = any(
        tag in filename for tag in ("ULVS_DDP", "DLIS_AZ_DDP", "DLV_DDP")
    )
    if label_width_is_exact:
        bytes_per_row = identifiers["ROW_BYTES"]
    else:
        bytes_per_row = identifiers["ROW_BYTES"] + 1
    return True, count_from_bottom_of_file(
        filename, identifiers["ROWS"], row_bytes=bytes_per_row
    )

get_position(identifiers, block, target, name, filename, start_byte)

HITS: * cassini_hp * dark * ddr * misc_img_text * img_table * strip * solar * sun * time * vis_extra * vis

Source code in pdr/formats/cassini.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
def get_position(identifiers, block, target, name, filename, start_byte):
    """
    Work out the position of a table, or of the HEADER that precedes it,
    overriding the length (and, for HEADER, the start) produced by the
    generic `table_position()`.

    The table length is ROWS times the row width, where the row width is
    ROW_BYTES for ULVS_DDP, DLIS_AZ_DDP and DLV_DDP files and ROW_BYTES + 1
    otherwise. For the HEADER object, the length is instead the file size
    minus that table length, and the start is forced to 0.

    Files with "IR_" in their name are handed to `tbd()` first.

    HITS:
    * cassini_hp
        * dark
        * ddr
        * misc_img_text
        * img_table
        * strip
        * solar
        * sun
        * time
        * vis_extra
        * vis
    """
    if "IR_" in filename:
        tbd(name, block)
    table_props = table_position(identifiers, block, target, name, start_byte)
    row_count = identifiers["ROWS"]
    label_width_is_exact = any(
        tag in filename for tag in ("ULVS_DDP", "DLIS_AZ_DDP", "DLV_DDP")
    )
    if label_width_is_exact:
        bytes_per_row = identifiers["ROW_BYTES"]
    else:
        bytes_per_row = identifiers["ROW_BYTES"] + 1
    table_bytes = row_count * bytes_per_row
    if name != "HEADER":
        table_props["length"] = table_bytes
        return table_props
    # the header is whatever comes before the table in the file
    if isinstance(filename, list):
        filename = filename[0]
    header_bytes = os.path.getsize(Path(filename)) - table_bytes
    table_props["start"] = 0
    table_props["length"] = header_bytes
    return table_props

get_special_qube_band_storage()

HITS: * cassini_uvis * fuv * euv

Source code in pdr/formats/cassini.py
219
220
221
222
223
224
225
226
227
def get_special_qube_band_storage():
    """
    Force the band storage type of these qubes to "BAND_SEQUENTIAL".

    Returns `(True, "BAND_SEQUENTIAL")`.

    HITS:
    * cassini_uvis
        * fuv
        * euv
    """
    return True, "BAND_SEQUENTIAL"

get_structure(block, name, filename, data, identifiers)

the data type that goes here double defines the 32 byte prefix/offset. By skipping the parse_table_structure we never add the prefix bytes so it works as is.

HITS: * cassini_hp * hasi_acc * hasi_ppi * hasi_pwa * hasi_tem * hasi_dpu * hasi_prof

Source code in pdr/formats/cassini.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def get_structure(block, name, filename, data, identifiers):
    """
    Read a table's format definition, skipping the usual offset and sample
    type computation for HASI and HUY_DTWG_ENTRY_AERO files.

    The data type that goes here double defines the 32 byte prefix/offset.
    By skipping the parse_table_structure we never add the prefix bytes so
    it works as is.

    For HASI / HUY_DTWG_ENTRY_AERO files, returns `(fmtdef, None)`; the AERO
    files additionally get the NAME of rows 5, 6 and 7 replaced. For all
    other files, returns the `(fmtdef, dt)` produced by `compute_offsets()`
    followed by `insert_sample_types_into_df()`.

    HITS:
    * cassini_hp
        * hasi_acc
        * hasi_ppi
        * hasi_pwa
        * hasi_tem
        * hasi_dpu
        * hasi_prof
    """
    fmtdef = pdr.loaders.queries.read_table_structure(
        block, name, filename, data, identifiers
    )
    is_aero = "HUY_DTWG_ENTRY_AERO" in filename
    if not ("HASI" in filename or is_aero):
        # ordinary products still get the generic treatment
        from pdr.pd_utils import insert_sample_types_into_df, compute_offsets

        fmtdef = compute_offsets(fmtdef)
        fmtdef, dt = insert_sample_types_into_df(fmtdef, identifiers)
        return fmtdef, dt
    if is_aero:
        replacement_names = {
            5: "KNUDSEN FREESTR. HARD SPHERE NR. [=2.8351E-8/RHO]",
            6: "KNUDSEN NR. [=1.2533*SQRT(2)*Ma/Re]",
            7: "REYNOLD NR. [=RHO*VREL*D/Mu]",
        }
        for row, column_name in replacement_names.items():
            fmtdef.at[row, "NAME"] = column_name
    return fmtdef, None

iss_cal_trivial_loader(pointer)

A subset of the ISS calibration images (those with "FILE_RECORDS = 1025") appear to not actually have LINE_PREFIX_TABLEs or TELEMETRY_TABLEs

HITS * cassini_iss * calib (partial)

Source code in pdr/formats/cassini.py
292
293
294
295
296
297
298
299
300
301
302
303
304
def iss_cal_trivial_loader(pointer):
    """
    Treat `pointer` as trivial and warn that the object does not appear to
    exist.

    A subset of the ISS calibration images (those with "FILE_RECORDS = 1025")
    appear to not actually have LINE_PREFIX_TABLEs or TELEMETRY_TABLEs.

    Always returns True.

    HITS
    * cassini_iss
        * calib (partial)
    """
    message = f"This product's {pointer} does not appear to exist."
    warnings.warn(message)
    return True

iss_calib_da_special_block(data, name)

The labels for some Cassini ISS calibration images with a .DA filename extension incorrectly use LINE_PREFIX_BYTES. A subset of calibration images with a .IMG filename extension are formatted like the .DA products, and also incorrectly reference LINE_PREFIX_BYTES

HITS * cassini_iss * calib_da * calib (partial)

Source code in pdr/formats/cassini.py
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
def iss_calib_da_special_block(data, name):
    """
    Strip an incorrect LINE_PREFIX_BYTES entry from the block for `name`.

    The labels for some Cassini ISS calibration images with a .DA filename
    extension incorrectly use LINE_PREFIX_BYTES. A subset of calibration
    images with a .IMG filename extension are formatted like the .DA
    products, and also incorrectly reference LINE_PREFIX_BYTES.

    Returns `(True, block)` with the key removed if it was present, and
    `(False, block)` with the block untouched otherwise.

    HITS
    * cassini_iss
        * calib_da
        * calib (partial)
    """
    block = data.metablock_(name)
    if "LINE_PREFIX_BYTES" not in block:
        return False, block
    del block["LINE_PREFIX_BYTES"]
    return True, block

iss_edr_special_block(data, name)

Some of the ISS EDR and calibration products give their ^STRUCTURE and ^LINE_PREFIX_STRUCTURE filenames in the format: "../../label/prefix3.fmt"

HITS * cassini_iss * edr_sat * edr_evj * calib (partial)

Source code in pdr/formats/cassini.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
def iss_edr_special_block(data, name):
    """
    Reduce a path-like format-file reference to its bare filename.

    Some of the ISS EDR and calibration products give their ^STRUCTURE and
    ^LINE_PREFIX_STRUCTURE filenames in the format: "../../label/prefix3.fmt"

    Only two objects are affected: LINE_PREFIX_TABLE (via its
    ^LINE_PREFIX_STRUCTURE) and TELEMETRY_TABLE (via its ^STRUCTURE). If the
    relevant value contains a "/", it is replaced in the block by the text
    after the last "/" and `(True, block)` is returned. Otherwise the block
    is returned unchanged as `(False, block)`.

    HITS
    * cassini_iss
        * edr_sat
        * edr_evj
        * calib (partial)
    """
    block = data.metablock_(name)
    structure_key = {
        "LINE_PREFIX_TABLE": "^LINE_PREFIX_STRUCTURE",
        "TELEMETRY_TABLE": "^STRUCTURE",
    }.get(name)
    if structure_key is not None and "/" in block[structure_key]:
        block[structure_key] = block[structure_key].split("/")[-1]
        return True, block
    return False, block

iss_telemetry_bit_col_format(obj, definition)

The format file for Cassini ISS telemetry tables incorrectly uses BIT_DATA_TYPE instead of DATA_TYPE when defining its top-level COLUMN (causing a key error in add_bit_column_info()). It also says the data type is BINARY instead of (presumably) MSB_BIT_STRING.

HITS: * cassini_iss * calib * calib_atm * edr_evj * edr_sat

Source code in pdr/formats/cassini.py
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
def iss_telemetry_bit_col_format(obj, definition):
    """
    Set DATA_TYPE to "MSB_BIT_STRING" on a telemetry column.

    The format file for Cassini ISS telemetry tables incorrectly uses
    BIT_DATA_TYPE instead of DATA_TYPE when defining its top-level COLUMN
    (causing a key error in add_bit_column_info()). It also says the data
    type is BINARY instead of (presumably) MSB_BIT_STRING.

    Both `obj` and `definition` are modified in place; `obj` is the one that
    matters for the return value, `definition` is fixed as a courtesy.

    Returns `(True, obj)`.

    HITS:
    * cassini_iss
        * calib
        * calib_atm
        * edr_evj
        * edr_sat
    """
    for column_mapping in (obj, definition):
        column_mapping["DATA_TYPE"] = "MSB_BIT_STRING"
    return True, obj

line_prefix_sample_type(base_samp_info)

Each time byte order is specified for these products it is LSB. However, for columns whose values can be verified, it is always actually MSB. This special case forces all such types to MSB, and assumes BIT_STRING refers to MSB_BIT_STRING. "N/A" samples are treated as CHARACTER / void.

HITS * cassini_iss * calib * calib_atm * edr_evj * edr_sat

Source code in pdr/formats/cassini.py
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
def line_prefix_sample_type(base_samp_info):
    """
    Override the sample type declared for a line-prefix column.

    Each time byte order is specified for these products it is LSB. However,
    for columns whose values can be verified, it is always actually MSB. So:

    * a SAMPLE_TYPE containing "N/A" becomes "VOID";
    * otherwise, a SAMPLE_TYPE containing "LSB" has it replaced with "MSB";
    * otherwise, a SAMPLE_TYPE of exactly "BIT_STRING" becomes
      "MSB_BIT_STRING".

    Returns `(True, <numpy type string from sample_types()>)` in those three
    cases and `(False, None)` for any other sample type.

    HITS
    * cassini_iss
        * calib
        * calib_atm
        * edr_evj
        * edr_sat
    """
    from pdr.datatypes import sample_types

    declared_type = base_samp_info["SAMPLE_TYPE"]
    bytes_per_sample = base_samp_info["BYTES_PER_PIXEL"]
    if "N/A" in declared_type:
        corrected_type = "VOID"
    elif "LSB" in declared_type:
        corrected_type = declared_type.replace("LSB", "MSB")
    elif declared_type == "BIT_STRING":
        corrected_type = "MSB_BIT_STRING"
    else:
        # nothing to correct; let the generic logic handle it
        return False, None
    numpy_type = sample_types(
        corrected_type, int(bytes_per_sample), for_numpy=True
    )
    return True, numpy_type

looks_like_ascii(data, pointer)

Source code in pdr/formats/cassini.py
89
90
91
92
93
94
95
def looks_like_ascii(data, pointer):
    """
    Guess whether the object named by `pointer` is ASCII.

    True if the pointer name contains "SPREADSHEET" or "ASCII", or if the
    object's metadata block gives an INTERCHANGE_FORMAT of "ASCII". The
    metadata block is only consulted when the name alone is not decisive.
    """
    if "SPREADSHEET" in pointer or "ASCII" in pointer:
        return True
    return data.metablock(pointer).get("INTERCHANGE_FORMAT") == "ASCII"

rpws_ancil_position(identifiers, block, target, name, start_byte)

Most of the labels have the wrong ROWS value. This special case uses the FILE_RECORDS instead.

HITS: * cassini_rwps * ancil_tol

Source code in pdr/formats/cassini.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
def rpws_ancil_position(identifiers, block, target, name, start_byte):
    """
    Compute a table's position with its length taken from FILE_RECORDS
    rather than ROWS.

    Most of the labels have the wrong ROWS value, so the length returned by
    the generic `table_position()` is overwritten with
    FILE_RECORDS * ROW_BYTES. Everything else in the result is left as
    `table_position()` produced it.

    HITS:
    * cassini_rwps
        * ancil_tol
    """
    table_props = table_position(identifiers, block, target, name, start_byte)
    table_props["length"] = (
        identifiers["FILE_RECORDS"] * identifiers["ROW_BYTES"]
    )
    return table_props

spreadsheet_loader(filename, fmtdef_dt, data_set_id)

HITS: * cassini_mimi * edr_lemms (partial) * rdr_chems_avg * rdr_chems_fullres * rdr_inca * rdr_lemms_avg * rdr_lemms_fullres * rdr_ancil * cassini_radar * asum * cassini_rpws * refdr_wbr * refdr_wfr

Source code in pdr/formats/cassini.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def spreadsheet_loader(filename, fmtdef_dt, data_set_id):
    """
    Read a Cassini CSV table with `pandas.read_csv`, choosing the header and
    skipped rows from the data set and filename.

    * Data sets with "UNCALIB" in their ID: plain `read_csv` with pandas
      defaults; `fmtdef_dt` is not used.
    * Filenames containing "FULL": skip the first 4 rows and name columns
      from the format definition -- except for CO-S-MIMI-4-CHEMS-CALIB-V1.0,
      where the first row is used as the header and rows 1-3 are skipped.
    * CO-SSA-RADAR-3-ABDR-SUMMARY-V1.0: skip nothing; name columns from the
      format definition.
    * Anything else: skip the first 7 rows; name columns from the format
      definition.

    Returns the resulting DataFrame.

    HITS:
    * cassini_mimi
        * edr_lemms (partial)
        * rdr_chems_avg
        * rdr_chems_fullres
        * rdr_inca
        * rdr_lemms_avg
        * rdr_lemms_fullres
        * rdr_ancil
    * cassini_radar
        * asum
    * cassini_rpws
        * refdr_wbr
        * refdr_wfr
    """
    import pandas as pd

    if "UNCALIB" in data_set_id:
        return pd.read_csv(filename)
    fmtdef, _ = fmtdef_dt
    # defaults describe the "anything else" case; the branches below adjust
    read_options = {"header": None, "skiprows": 7, "names": fmtdef.NAME}
    if "FULL" in filename:
        read_options["skiprows"] = 4
        if data_set_id == "CO-S-MIMI-4-CHEMS-CALIB-V1.0":
            read_options.update(header=0, names=None, skiprows=range(1, 4))
    elif data_set_id == "CO-SSA-RADAR-3-ABDR-SUMMARY-V1.0":
        read_options["skiprows"] = 0
    return pd.read_csv(filename, **read_options)

trivial_loader(pointer)

HITS * cassini_iss * calib * edr_evj * edr_sat

Source code in pdr/formats/cassini.py
175
176
177
178
179
180
181
182
183
184
185
186
187
def trivial_loader(pointer):
    """
    Treat `pointer` as trivial and warn that tables of this kind are not
    currently supported for Cassini ISS EDR/calibration products.

    Always returns True.

    HITS
    * cassini_iss
        * calib
        * edr_evj
        * edr_sat
    """
    message = (
        f"The Cassini ISS EDR/calibration {pointer} tables are not currently "
        f"supported."
    )
    warnings.warn(message)
    return True

xdr_redirect_to_image_block(data)

HITS: * cassini_hp * img_xdr

Source code in pdr/formats/cassini.py
208
209
210
211
212
213
214
215
216
def xdr_redirect_to_image_block(data):
    """
    Return the metadata block that `data.metablock_()` gives for "IMAGE".

    HITS:
    * cassini_hp
        * img_xdr
    """
    return data.metablock_("IMAGE")

formats.checkers

This module contains functions that preempt generic metadata- or data-parsing behaviors. They are intended to manage idiosyncrasies common to all products of a particular type (or even all products in a whole dataset), including but not limited to:

  • Malformatted labels
  • Incorrect metadata
  • Malformatted data
  • Technically correct but extremely unusual data formatting

To put this another way, they facilitate single-dispatch polymorphism on the semantic level of data product type.

Most functions in this file are intended to be applied by func.specialize as wrappers to other functions, typically a query function in loaders.queries or the loader_function attribute of a loaders.queries.Loader subclass. However, this is not strict; they may also wrap functions in other modules, and functions may call them inline rather than use them as wrappers.

Every function in this module should be named check_special_{something}, where 'something' clearly designates the metadata-parsing or data-loading behavior it may sometimes preempt.

Most functions in this module should return a tuple whose first element is a bool and whose second element is the "special" value. If the first element is True, it means that there is a relevant special case, so the caller should use the "special" value instead of engaging in its normal behavior; if it is False, there is no relevant special case and the caller should continue with its normal behavior. The second element of the tuple should always be None if the first element is False.

If the function is intended to wrap a generic function, the second element of this tuple, when not None, must always share the return type of that generic function. Also, if it is intended for the func.softquery() workflow, it should follow that workflow's argument naming and type annotation conventions.

Exceptions to these naming and signature conventions can be made for checkers designed specifically to be called inline of a specific handler function.

check_special_bit_column_case(identifiers: Mapping[str, Any]) -> tuple[bool, Optional[str]]

Special case checker used by bit_handling.set_bit_string_data_type() to preempt generic data type inference.

Source code in pdr/formats/checkers.py
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
def check_special_bit_column_case(
    identifiers: Mapping[str, Any]
) -> tuple[bool, Optional[str]]:
    """
    Special case checker used by `bit_handling.set_bit_string_data_type()`
    to preempt generic data type inference.

    Returns `(True, "MSB_BIT_STRING")` when `identifiers["INSTRUMENT_NAME"]`
    is one of the instruments listed below, and `(False, None)` otherwise.
    """
    # NOTE(review): two of these names lack an expected space
    # ("X-RAYSPECTROMETER", "ANDIONOSPHERE"). Presumably they mirror the
    # values as they reach this function -- confirm before "correcting".
    msb_instruments = (
        "ALPHA PARTICLE X-RAYSPECTROMETER",
        "JOVIAN AURORAL PLASMA DISTRIBUTIONS EXPERIMENT",
        "CHEMISTRY AND MINERALOGY INSTRUMENT",
        "MARS ADVANCED RADAR FOR SUBSURFACE ANDIONOSPHERE SOUNDING",
    )
    is_special = identifiers["INSTRUMENT_NAME"] in msb_instruments
    return (True, "MSB_BIT_STRING") if is_special else (False, None)

check_special_bit_format(obj: dict, definition: MultiDict, identifiers: DataIdentifiers) -> tuple[bool, Optional[dict]]

Special case checker used by add_bit_column_info() to fix problems in obj and/or definition caused by mistakes in an external format file. Intended for cases where check_special_block() doesn't touch the relevant metadata, and errors are hit before check_special_structure() can be useful.

Source code in pdr/formats/checkers.py
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
def check_special_bit_format(
    obj: dict,
    definition: MultiDict,
    identifiers: DataIdentifiers
) -> tuple[bool, Optional[dict]]:
    """
    Special case checker used by add_bit_column_info() to fix problems in
    `obj` and/or `definition` caused by mistakes in an external format file.
    Intended for cases where check_special_block() doesn't touch the relevant
    metadata, and errors are hit before check_special_structure() can be
    useful.

    Dispatches on `identifiers` and returns whatever the selected
    mission-specific helper returns; when nothing matches, returns
    `(False, None)`.
    """
    cassini_iss_dsid = r"CO-(CAL-ISS|[S/EVJ-]+ISSNA/ISSWA-2)"
    if re.match(cassini_iss_dsid, identifiers["DATA_SET_ID"]):
        return formats.cassini.iss_telemetry_bit_col_format(obj, definition)
    if identifiers["SPACECRAFT_NAME"] != "GALILEO ORBITER":
        return False, None
    # two spellings of the instrument name are accepted
    galileo_ssi_names = ("SOLID_STATE_IMAGING", "SOLID STATE IMAGING SYSTEM")
    if identifiers["INSTRUMENT_NAME"] in galileo_ssi_names:
        return formats.galileo.ssi_redr_bit_col_format(definition)
    return False, None

check_special_bit_start_case(identifiers, list_of_pvl_objects_for_bit_columns, start_bit_list) -> tuple[bool, Optional[list[int]]]

Special case checker used by get_bit_start_and_size() to fix incorrectly-defined bit offsets.

Source code in pdr/formats/checkers.py
766
767
768
769
770
771
772
773
774
775
776
777
def check_special_bit_start_case(
    identifiers, list_of_pvl_objects_for_bit_columns, start_bit_list
) -> tuple[bool, Optional[list[int]]]:
    """
    Special case checker used by get_bit_start_and_size() to fix
    incorrectly-defined bit offsets.

    When `identifiers["INSTRUMENT_NAME"]` is exactly
    "JOVIAN INFRARED AURORAL MAPPER", returns whatever
    `formats.juno.bit_start_find_and_fix()` returns for the given bit column
    objects and start bit list. Otherwise returns `(False, None)`.
    """
    # Compare with ==, not `in`: `x in "<string>"` is a substring test, so an
    # empty or partial instrument name (e.g. "" or "MAPPER") would also match.
    if identifiers["INSTRUMENT_NAME"] == "JOVIAN INFRARED AURORAL MAPPER":
        return formats.juno.bit_start_find_and_fix(
            list_of_pvl_objects_for_bit_columns, start_bit_list
        )
    return False, None

check_special_block(name: str, data: PDRLike, identifiers: Mapping) -> tuple[bool, Optional[MultiDict]]

specialize() target for queries.get_block(). Intended for cases in which label pointers don't correspond to label block names AND/OR if a value within the block needs to be changed before going to other functions.

Source code in pdr/formats/checkers.py
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
def check_special_block(
    name: str, data: PDRLike, identifiers: Mapping
) -> tuple[bool, Optional[MultiDict]]:
    """
    `specialize()` target for `queries.get_block()`. Intended for cases in
    which label pointers don't correspond to label block names AND/OR if a
    value within the block needs to be changed before going to other functions.

    The checks below run in order and the first one that matches wins. Each
    check combines the object `name` with values from `identifiers` (and, in
    a few cases, values read from `data`) to select a mission-specific
    helper in `formats`.

    Args:
        name: name of the data object whose block is being requested.
        data: the product being read; passed through to most helpers.
        identifiers: mapping of label identifier values (DATA_SET_ID,
            INSTRUMENT_ID, PRODUCT_ID, ...).

    Returns:
        `(False, None)` if no check matches. If a check matches, either
        `(True, <helper result>)` or the helper's return value passed
        through unchanged.

    NOTE(review): branches written as `return formats...(...)` without a
    leading `True,` presumably rely on the helper building the
    (bool, block) tuple itself -- confirm against the helpers.
    """
    # --- checks that depend only on the object name ---
    if name == "XDR_DOCUMENT":
        return True, formats.cassini.xdr_redirect_to_image_block(data)
    if name == "CHMN_HSK_HEADER_TABLE":
        return True, formats.msl_cmn.fix_mangled_name(data)
    # --- Juno ---
    if (
        identifiers["DATA_SET_ID"].startswith("JNO-E/J/SS")
        and "BSTFULL" in identifiers["DATA_SET_ID"]
        and "FREQ_OFFSET_TABLE" in data.keys()
        and name in ("FREQ_OFFSET_TABLE", "DATA_TABLE")
    ):
        return True, formats.juno.waves_burst_fix_table_names(data, name)
    # --- LRO ---
    if (
        identifiers["INSTRUMENT_ID"] == "LAMP"
        and identifiers["PRODUCT_TYPE"] == "RDR"
        and "IMAGE" in name
        and "HISTOGRAM" in name
    ):
        # multiple image objects are defined by one non-unique image object
        return True, formats.lro.lamp_rdr_histogram_image_loader(data)
    if (
        identifiers["DATA_SET_ID"] == "LRO-L-MRFLRO-5-GLOBAL-MOSAIC-V1.0"
        and "GLOBAL_S4_32PPD" in data.metaget_("PRODUCT_ID")
        and name == "IMAGE"
    ):
        # typo in one of the labels
        return True, formats.lro.mini_rf_image_loader(data, name)
    # --- Pioneer Venus Orbiter ---
    if (
        identifiers["DATA_SET_ID"] == "PVO-V-ORPA-5-ELE/ION/PHOTO/UADS-V1.0"
        and "ORPA_LOW_RES" in identifiers["PRODUCT_ID"]
        and name == "TABLE"
    ):
        return True, formats.pvo.orpa_low_res_loader(data, name)
    if (
        identifiers["DATA_SET_ID"] == "PVO-V-OIMS-4-IONDENSITY-12S-V1.0"
        and name == "TABLE"
    ):
        return True, formats.pvo.oims_12s_loader(data, name)
    # --- Galileo EPD / PWS summary products ---
    if (
        "GO-E-EPD-4-SUMM-" in identifiers["DATA_SET_ID"]
        and "E1_" in identifiers["PRODUCT_ID"]
        and name == "TIME_SERIES"
    ):
        return True, formats.galileo.epd_special_block(data, name)
    if (
        identifiers["INSTRUMENT_NAME"] == "PLASMA WAVE RECEIVER"
        and "SUMM" in identifiers["DATA_SET_ID"]
        and (name == "TIME_SERIES" or name == "TABLE")
    ):
        return True, formats.galileo.pws_special_block(data, name)
    # --- Ulysses ---
    if "ULY-J-EPAC-4-SUMM" in identifiers["DATA_SET_ID"] and name == "TABLE":
        return True, formats.ulysses.get_special_block(data, name, identifiers)
    # --- Voyager 2 MAG / PLS ---
    if (
        "VG2-N-MAG-4-RDR-HGCOORDS" in identifiers["DATA_SET_ID"]
        and identifiers["STANDARD_DATA_PRODUCT_ID"] == "ASCII DATA"
        and name == "TABLE"
    ):
        return True, formats.voyager.mag_special_block(data, name)
    if (
        identifiers["DATA_SET_ID"] == "VG2-SS-PLS-4-SUMM-1HR-AVG-V1.0"
        and name == "TABLE"
    ):
        return formats.voyager.pls_avg_special_block(data, name)
    if (
        identifiers["DATA_SET_ID"] == "VG2-SS-PLS-3-RDR-FINE-RES-V1.0"
        and name == "TABLE"
    ):
        return formats.voyager.pls_fine_special_block(data, name)
    if (
        identifiers["DATA_SET_ID"] == "VG2-U-PLS-5-SUMM-IONBR-48SEC-V1.0"
        and identifiers["PRODUCT_ID"] == "SUMRY.DAT"
        and name == "TIME_SERIES"
    ):
        return formats.voyager.pls_ionbr_special_block(data, name)
    # --- Mariner 9 IRIS ---
    if identifiers["DATA_SET_ID"] == "M9-M-IRIS-3-RDR-V1.0" and (
        name == "SPECTRAL_SERIES"  # the data product
        or "SPECTRUM" in name  # the calibration data
    ):
        return True, formats.mariner.get_special_block(data, name)
    # --- International Halley Watch ---
    if (
        re.match(
            r"IHW-C-MSNRDR-3-RDR-HALLEY-(ETA-AQUAR|ORIONID)-V1.0",
            identifiers["DATA_SET_ID"],
        )
        and name == "TABLE"
    ):
        return True, formats.ihw.get_special_block(data, name)
    # --- Voyager 2 PRA (data set ID is matched by prefix and suffix) ---
    if (
        "VG2-" in identifiers["DATA_SET_ID"]
        and "-PRA-3-RDR-LOWBAND-6SEC-V1.0" in identifiers["DATA_SET_ID"]
        and name == "TABLE"
    ):
        return formats.voyager.pra_special_block(data, name, identifiers)
    # --- Phoenix MECA ---
    if (
        identifiers["DATA_SET_ID"] == "PHX-M-MECA-2-NIEDR-V1.0"
        and re.match(r"MECA-EM1[012]", identifiers["PRODUCT_TYPE"])
        and name == "WCHEM_TABLE"
    ):
        return True, formats.phoenix.wcl_edr_special_block(data, name)
    # --- Mars Express ---
    if (
        "MEX-M-PFS-2-EDR-" in identifiers["DATA_SET_ID"]
        and (
            "RAW" in identifiers["PRODUCT_ID"]
            or "HK" in identifiers["PRODUCT_ID"]
        )
        and name == "TABLE"
    ):
        return formats.mex.pfs_edr_special_block(data, name)
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "MARS EXPRESS"
        and identifiers["INSTRUMENT_ID"] == "MRS"
        and all(x in identifiers["PRODUCT_ID"] for x in ["ODF", "L1B", "RMP"])
        and "TABLE" in name
    ):
        return True, formats.mex.mrs_l1b_odf_rmp_redirect(data)
    # --- ground-based (WFF ATM) ---
    if (
        identifiers["DATA_SET_ID"] == "WFF-E-ATM-1/5-V1.0"
        and name == "IMAGE"
    ):
        return formats.ground.wff_atm_special_block(data, name)
    # --- Cassini ISS ---
    # matched either by ".DA" in the first element of the ^IMAGE pointer
    # value or by a FILE_RECORDS value of 1025
    if (
        identifiers["DATA_SET_ID"] == "CO-CAL-ISS-2-V1.0"
        and name == "IMAGE"
        and (".DA" in data.metaget_("^IMAGE")[0] 
             or identifiers["FILE_RECORDS"] == 1025)
    ):
        return formats.cassini.iss_calib_da_special_block(data, name)
    if (
        identifiers["DATA_SET_ID"] in ("CO-S-ISSNA/ISSWA-2-EDR-V1.0",
                                       "CO-E/V/J-ISSNA/ISSWA-2-EDR-V1.0",
                                       "CO-CAL-ISSNA/ISSWA-2-EDR-V1.0")
        and name in ("LINE_PREFIX_TABLE",
                     "TELEMETRY_TABLE")
    ):
        return formats.cassini.iss_edr_special_block(data, name)
    # --- Mars Global Surveyor MOLA ---
    if (
        identifiers["DATA_SET_ID"] == "MGS-M-MOLA-3-PEDR-L1A-V1.0"
        and "TABLE" in name
    ):
        return True, formats.mgs.mola_pedr_special_block(data, name, identifiers)
    # --- IUE ---
    if (
        identifiers["DATA_SET_ID"] == "IUE-C-SWP-3-EDR-IUECDB-V1.0"
        and name == "QUALITY_IMAGE"
    ):
        return formats.iue.get_special_block(data, name)
    # --- Galileo SSI ---
    if (
        identifiers["SPACECRAFT_NAME"] == "GALILEO ORBITER"
        and identifiers["INSTRUMENT_NAME"] == "SOLID STATE IMAGING SYSTEM"
        and name == "LINE_PREFIX_TABLE"
    ):
        return True, formats.galileo.ssi_prefix_block(data, name)
    # --- MSL camera EDRs (MAHLI, Mastcam left/right, MARDI) ---
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "MARS SCIENCE LABORATORY"
        and identifiers["INSTRUMENT_ID"] in ["MAHLI", "MAST_RIGHT",
                                             "MAST_LEFT", "MARDI"]
        and "EDR" in identifiers["DATA_SET_ID"]
        and name == "IMAGE"
    ):
        return True, formats.msl_edr.get_special_block(data, name)
    # --- Venus Express VRA ---
    # unlike most checks here, this one does not test `name`
    if (
            identifiers["INSTRUMENT_HOST_NAME"] == "VENUS EXPRESS"
            and identifiers["INSTRUMENT_ID"] in ["VRA"]
            and "1B_RMP_" in identifiers["PRODUCT_ID"]
    ):
        return formats.vex_vera.get_special_block(data, name)
    # --- SLN-L-* data sets, handled by formats.kaguya ---
    if (
        identifiers["DATA_SET_ID"] == 'SLN-L-GRS-5-NUCLIDE-MAP-V2.0'
        and name == "TABLE"
    ):
        return True, formats.kaguya.get_special_block_grs_table(data)
    if (
        "SLN-L-SP" in identifiers['DATA_SET_ID']
        and "LEVEL2B" in identifiers['DATA_SET_ID']
        and name == "ANCILLARY_AND_SUPPLEMENT_DATA"
    ):
        return True, formats.kaguya.get_special_block_sp_2b_supp(data, name)
    # the helpers called with no arguments below receive neither `data` nor
    # `name`, so whatever they return cannot depend on this product's label
    if(
        "SLN-L-LMAG-5-MA-GRID" in identifiers['DATA_SET_ID']
        and name == "TABLE"
    ):
        return True, formats.kaguya.get_special_grid_table_block()
    if(
        identifiers['DATA_SET_ID'] == "SLN-L-LMAG-5-1D-SIGMA-ECS-V1.0"
        and name == "TABLE"
    ):
        return True, formats.kaguya.get_special_1d_sigma_block()
    if(
        identifiers['DATA_SET_ID'] == "SLN-L-LMAG-3-MAG-TS-V1.0"
        and name == "TABLE"
    ):
        return True, formats.kaguya.get_special_mag_ts_block(data)
    if(
        "SLN-L-RISE-5-TRAJ-" in identifiers['DATA_SET_ID']
        and name == "TABLE"
    ):
        return True, formats.kaguya.rise_traj_special_block()

    # no special case applies; caller should use its generic behavior
    return False, None

check_special_compressed_file_reader(identifiers: DataIdentifiers, fn: str)

Distribute to correct specialized image loader, otherwise return False/None. Preempt loaders.datawrap.ReadImage's dispatch to read_image()

Source code in pdr/formats/checkers.py
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
def check_special_compressed_file_reader(identifiers: DataIdentifiers, fn: str):
    """
    Distribute to correct specialized image loader, otherwise return
    False/None. Preempt loaders.datawrap.ReadImage's dispatch to `read_image()`

    Returns `(True, <loader result>)` when `identifiers` match one of the
    cases below, and `(False, None)` otherwise.
    """
    msl_cameras = ["MAHLI", "MAST_RIGHT", "MAST_LEFT", "MARDI"]
    is_msl_camera_edr = (
        identifiers["INSTRUMENT_HOST_NAME"] == "MARS SCIENCE LABORATORY"
        and identifiers["INSTRUMENT_ID"] in msl_cameras
        and "EDR" in identifiers["DATA_SET_ID"]
    )
    if is_msl_camera_edr:
        return True, formats.msl_edr.msl_edr_image_loader(fn)

    is_mgs_moc_imq = (
        identifiers["SPACECRAFT_NAME"] == "MARS_GLOBAL_SURVEYOR"
        and identifiers["INSTRUMENT_ID"] in ["MOC-NA", "MOC-WA"]
        and "IMQ" in identifiers["FILE_NAME"]
    )
    if is_mgs_moc_imq:
        return True, formats.mgs_moc.mgs_moc_comp_image_loader(fn, identifiers)

    return False, None

check_special_fits_start_byte(identifiers: DataIdentifiers, name: str, hdulist: HDUList) -> tuple[bool, Optional[int]]

Preempts generic PDS3 data object -> FITS start byte mapping. Wraps get_fits_start_byte().

Source code in pdr/formats/checkers.py
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
def check_special_fits_start_byte(
    identifiers: DataIdentifiers, name: str, hdulist: HDUList
) -> tuple[bool, Optional[int]]:
    """
    Preempts generic PDS3 data object -> FITS start byte mapping. Wraps
    `get_fits_start_byte()`.

    Checks run in order; the first match selects a mission-specific helper.
    Returns `(False, None)` when nothing matches.

    NOTE(review): the two LRO LAMP branches return their helper's result
    directly rather than as `(True, value)` -- presumably those helpers
    build the tuple themselves; confirm.
    """
    is_dawn_fc2_history = (
        identifiers["INSTRUMENT_HOST_NAME"] == "DAWN"
        and "FC2" in identifiers["DATA_SET_ID"]
        and name == "HISTORY"
    )
    if is_dawn_fc2_history:
        return True, formats.dawn.dawn_history_hdu_exception()

    dsid = identifiers["DATA_SET_ID"]
    if dsid.startswith("HST-S-WFPC2-3-RPX") and "IMAGE" in name:
        return True, formats.saturn_rpx.rpx_img_hdu_start_byte(name, hdulist)

    is_hriv_deconv_mask = (
        identifiers["INSTRUMENT_ID"] == "HRIV"
        and identifiers["PRODUCT_TYPE"] == "RADIANCE_DECONVOLVED"
        and name.startswith("EXT_MASK")
    )
    if is_hriv_deconv_mask:
        return True, formats.epoxi.hriv_deconv_mask_start_byte(name, hdulist)

    # NOTE(review): an "MSGR-H-MDIS" data set dispatching into
    # formats.galileo looks surprising -- confirm the helper really lives
    # in that module.
    if dsid.startswith("MSGR-H-MDIS-6-CAL"):
        return True, formats.galileo.mdis_fits_start_byte(name, hdulist)
    if dsid == "MSSSO-J-CASPIR-3-RDR-SL9-STDS-V1.0":
        return True, formats.ground.mssso_cal_start_byte(name, hdulist)
    if "MEX-M-VMC-3-RDR" in dsid:
        return True, formats.mex.vmc_rdr_hdu_selection(name, hdulist)
    if dsid == "LRO-L-LAMP-2-EDR-V1.0" and "TABLE" in name:
        return formats.lro.lamp_edr_hdu_exceptions(name, hdulist)
    if dsid == "LRO-L-LAMP-3-RDR-V1.0":
        return formats.lro.lamp_rdr_hdu_start_byte(name, hdulist)

    product_type = identifiers['PRODUCT_TYPE']
    if product_type == "RDR" and "JNO-J-UVS-3-RDR-" in dsid:
        return True, formats.juno.uvs_rdr_start_byte(name, hdulist)
    if product_type == "EDR" and "JNO-J-UVS-2-EDR-" in dsid:
        return True, formats.juno.uvs_edr_start_byte(name, hdulist)
    return False, None

check_special_fn(data: PDRLike, object_name: str, identifiers: DataIdentifiers) -> tuple[bool, Optional[str]]

Preempts generic filename specification. Called inline by Data._object_to_filename().

Source code in pdr/formats/checkers.py
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
def check_special_fn(
    data: PDRLike, object_name: str, identifiers: DataIdentifiers
) -> tuple[bool, Optional[str]]:
    """
    Preempts generic filename specification. Called inline by
    `Data._object_to_filename()`.

    Checks run in order. A matching check returns its helper's result
    unchanged; if nothing matches, returns `(False, None)`.
    """
    dsid = identifiers["DATA_SET_ID"]
    if dsid == "CLEM1-L-RSS-5-BSR-V1.0" and object_name in (
        "HEADER_TABLE", "DATA_TABLE"
    ):
        # sequence wrapped as string for object names
        return formats.clementine.get_fn(data, object_name)

    spacecraft = identifiers["SPACECRAFT_NAME"]
    if (
        spacecraft == "MAGELLAN"
        and data.filename.endswith((".img", "ibg"))
        and object_name == "TABLE"
    ):
        return formats.mgn.get_fn(data)

    # filenames are frequently misspecified
    if dsid.startswith("CO-D-CDA") and object_name == "TABLE":
        return formats.cassini.cda_table_filename(data)

    # THEMIS labels don't always mention when a file is stored gzipped
    if identifiers["INSTRUMENT_ID"] == "THEMIS":
        return formats.themis.check_gzip_fn(data, object_name)

    nh_spreadsheet_dsids = (
        "NH-P-PEPSSI-4-PLASMA-V1.0",
        "NH-X-SWAP-5-DERIVED-SOLARWIND-V1.0",
        "NH-P/PSA-LORRI/ALICE/REX-5-ATMOS-V1.0",
    )
    if dsid in nh_spreadsheet_dsids and object_name == "SPREADSHEET":
        return formats.nh.get_fn(data)

    if (
        spacecraft == "GALILEO ORBITER"
        and identifiers["INSTRUMENT_NAME"] == "SOLID_STATE_IMAGING"
        and object_name == "IMAGE_LINE_PREFIX_TABLE"
    ):
        return formats.galileo.ssi_redr_prefix_fn(data)

    if (
        dsid == "VG2-SR/UR/NR-PPS-2/4-OCC-V1.0"
        and identifiers["PRODUCT_TYPE"] == "JITTER"
        and object_name == "SERIES"
    ):
        return formats.voyager.get_fn(data)

    msl_cameras = ["MAHLI", "MAST_RIGHT", "MAST_LEFT", "MARDI"]
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "MARS SCIENCE LABORATORY"
        and identifiers["INSTRUMENT_ID"] in msl_cameras
        and "EDR" in dsid
        and object_name == "IMAGE"
    ):
        return formats.msl_edr.msl_msss_edr_prefix_fn(data)
    return False, None

check_special_label(fn: Union[str, Path])

Used primarily to check for labels with known characters invalid in utf-8. We then read the label with a more correct or lenient encoding. Preempt loaders.datawrap.ReadLabel's dispatch to read_label(). Also used in read_pvl().

Source code in pdr/formats/checkers.py
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
def check_special_label(fn: Union[str, Path]):
    """
    Used primarily to check for labels with known characters invalid in utf-8.
    We then read the label with a more correct or lenient encoding. Preempt
    loaders.datawrap.ReadLabel's dispatch to `read_label()`. Also used in
    `read_pvl()`.

    The decision is made purely from substrings of the path. `fn` may be a
    `str` or a `Path`; it is converted with `str()` for the substring tests
    (a bare `Path` does not support `in`) and handed to the selected helper
    unchanged.

    Returns `(True, <helper result>)` if the path matches one of the known
    patterns, and `(False, None)` otherwise.
    """
    fn_str = str(fn)
    change_tags = ('CE1', 'CE2', 'CE3')
    change_suffixes = (
        '.1A', '.1B', '.1C',
        '.2A', '.2B', '.2C',
        '.3A', '.3B', '.3C',
        '.01', '.02', '.03',
    )
    if (
        any(tag in fn_str for tag in change_tags)
        and any(sfx in fn_str for sfx in change_suffixes)
    ):
        return True, formats.change.special_label(fn)
    if (
        ".TAB" in fn_str and "_DOY" in fn_str
        and ("MAG_" in fn_str or "BIO_" in fn_str) and "_V1" in fn_str
    ):
        return True, formats.vex_mag.special_label(fn)
    return False, None

check_special_objects(identifiers: DataIdentifiers)

Check to add objects not correctly ID'd as objects in a label, or remove objects ID'd in a label. Called inline by _find_objects().

Source code in pdr/formats/checkers.py
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
def check_special_objects(identifiers: DataIdentifiers):
    """
    Check to add objects not correctly ID'd as objects in a label, or remove
    objects ID'd in a label. Called inline by `_find_objects()`.

    Returns `(True, <list of object names>)` for the products matched below
    and `(False, None)` for everything else.
    """
    msl_cameras = ["MAHLI", "MAST_RIGHT", "MAST_LEFT", "MARDI"]
    if (
        identifiers["INSTRUMENT_HOST_NAME"] == "MARS SCIENCE LABORATORY"
        and identifiers["INSTRUMENT_ID"] in msl_cameras
        and "EDR" in identifiers["DATA_SET_ID"]
    ):
        # a consequence of this is the geometry file and miniheader objects
        # denoted in the label for the edrs
        return True, ['IMAGE', 'MODEL_DESC']

    # every remaining case is a Chang'E-3 rover ".2A" product
    if identifiers["SPACECRAFT_NAME"] != "Chang'E-3 Rover":
        return False, None
    product_id = identifiers["PRODUCT_ID"]
    if ".2A" not in product_id:
        return False, None
    # the object lists below would need to be modified if you could find the
    # cal target data Excel file and removed the bad characters from the
    # label around it
    if "CE3_BMYK_VNIS-CC_SCI_N" in product_id:
        return True, ['LABEL', 'IMAGE_PREFIX', 'IMAGE']
    table_only_tags = ("CE3_BMYK_VNIS-SC_SCI_N_", "CE3_BMYK_VNIS-SD_SCI_N_")
    if any(tag in product_id for tag in table_only_tags):
        return True, ['LABEL', 'TABLE']
    return False, None

check_special_offset(name: str, data: PDRLike, identifiers: DataIdentifiers, fn: str) -> tuple[bool, Optional[int]]

Preempt generic inference of an object's byte offset within a file. Wraps loaders.queries.data_start_byte().

Source code in pdr/formats/checkers.py
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def check_special_offset(
    name: str, data: PDRLike, identifiers: DataIdentifiers, fn: str
) -> tuple[bool, Optional[int]]:
    """
    Preempt generic inference of an object's byte offset within a file. Wraps
    `loaders.queries.data_start_byte()`.

    Checks run in order. A matching check returns its helper's result
    unchanged; if nothing matches, returns `(False, None)`.
    """
    instrument_id = identifiers["INSTRUMENT_ID"]
    # these incorrectly specify object length rather than
    # object offset in the ^HISTOGRAM pointer target
    if instrument_id == "CHEMIN":
        return formats.msl_cmn.get_offset(name)

    dsid = identifiers["DATA_SET_ID"]
    if dsid == "CLEM1-L-RSS-5-BSR-V1.0" and re.match(
        "(HEADER|DATA)_TABLE", name
    ):
        # sequence wrapped as string for object names
        return formats.clementine.get_offset(data, name)

    if instrument_id == "THEMIS" and name == "QUBE":
        return formats.themis.get_qube_offset(data)

    disr_filename_tags = re.compile(r"STRIP|VISIBL|IMAGE|IR_|TIME|SUN|SOLAR")
    is_disr_rdr = (
        identifiers["INSTRUMENT_NAME"] == "DESCENT IMAGER SPECTRAL RADIOMETER"
        and identifiers["PRODUCT_TYPE"] == "RDR"
    )
    # NOTE(review): the FILE_NAME test is not gated on the instrument, so any
    # product whose FILE_NAME contains one of these tags takes this branch.
    # Confirm that is intended rather than a missing pair of parentheses.
    if is_disr_rdr or disr_filename_tags.search(identifiers["FILE_NAME"]):
        return formats.cassini.get_offset(fn, identifiers)

    if (
        dsid == "CO-E/V/J-ISSNA/ISSWA-2-EDR-V1.0"
        and 1359362956
        <= float(data.metaget_("SPACECRAFT_CLOCK_STOP_COUNT"))
        <= 1363539029
    ):
        return formats.cassini.coiss_1006_offset(data, name, identifiers)

    if (
        instrument_id == "CRAT"
        and identifiers["PRODUCT_TYPE"] == "EDR"
        and name == "TABLE_1"
    ):
        return formats.lro.get_crater_offset()

    if dsid == "PHX-M-SSI-5-ATMOS-OPACITY-V1.0" and "TABLE" in name:
        return formats.phoenix.phxao_table_offset(fn, identifiers)
    if (
        dsid == "PHX-M-MECA-4-NIRDR-V1.0"
        and identifiers["PRODUCT_TYPE"] in ("MECA_WCL_CP", "MECA_WCL_CV")
        and "TABLE" in name
    ):
        return formats.phoenix.wcl_rdr_offset(data, name)

    on_msl = identifiers["INSTRUMENT_HOST_NAME"] == "MARS SCIENCE LABORATORY"
    if (
        on_msl
        and instrument_id == "REMS"
        and name == "REMS_REPORT_TABLE"
        and any(
            tag in identifiers["PRODUCT_ID"] for tag in ("HSDEF__", "HSREG__")
        )
    ):
        return formats.msl_rems.edr_offset(data, name)
    if (
        on_msl
        and instrument_id in ["MAHLI", "MAST_RIGHT", "MAST_LEFT", "MARDI"]
        and "EDR" in dsid
    ):
        return formats.msl_edr.edr_offset(data, name)
    return False, None

check_special_pds4_cases(structure, filename, object_name)

Load objects from PDS4 files with known issues that do not currently work with pds4_tools. Mostly utilized by datasets not verified by the PDS but that have PDS4 labels (ISRO, ESA, CNSA etc).

Source code in pdr/formats/checkers.py
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
def check_special_pds4_cases(structure, filename, object_name):
    """
    Hand off PDS4 objects with known problems to dedicated readers instead of
    pds4_tools. Mostly relevant to datasets that carry PDS4 labels but were
    not verified by the PDS (ISRO, ESA, CNSA etc).

    Matching is done on substrings of `filename` together with an exact
    `object_name`. Returns the selected reader's result on a match, or None
    when no special case applies.
    """
    if object_name == "TABLE_0":
        # CE5/CE6 LMS products; the filename must also contain "SCI" and
        # ".2B".
        lms_tags = (
            "CE6-L_GRAS_LMS-M",
            "CE6-L_GRAS_LMS-S",
            "CE6-L_GRAS_LMS-N",
            "CE5-L_GRAS_LMS-N",
            "CE5-L_GRAS_LMS-M",
        )
        if (
            "SCI" in filename
            and ".2B" in filename
            and any(tag in filename for tag in lms_tags)
        ):
            return formats.change.read_change_fw_table(structure, filename)

        # CE4 products read with read_table_using_spaces(); each filename
        # tag is paired with the marker that must accompany it.
        space_split_cases = (
            ("CE4_GRAS_ASAN-SCI_SCI", ".2B"),
            ("CE4_GRAS_LND-DPSL_SCI", ".2A"),
            ("CE4_GRAS_LND-ThN_SCI", ".2A"),
            ("CE4_GRAS_LND-TID_SCI", ".2A"),
        )
        if any(
            tag in filename and marker in filename
            for tag, marker in space_split_cases
        ):
            return formats.change.read_table_using_spaces(structure, filename)

        # Deliberately unhandled: "CE4_GRAS_VNIS-VD_SCI" / ".2B" / TABLE_0.
        # This is a fixed width table, but it's not all UTF-8 so PD can't
        # handle it (it would otherwise go to read_table_using_spaces()).
        if "CE4_GRAS_VNIS-SD_SCI" in filename and ".2B" in filename:
            return formats.change.read_change_fw_table(structure, filename)

    if "ch2_cla_l1_" in filename and object_name in ("header_Data", "data"):
        return formats.ch2_isro.read_class_fits_table(filename, object_name)
    return None

check_special_position(identifiers: DataIdentifiers, block: MultiDict, target: PhysicalTarget, name: str, fn: str, start_byte: int) -> tuple[bool, Optional[int]]

Preempt generic detection of a table's row or byte offset within a file. Wraps table_position(). Used for table-specific cases that are partially but not wholly handled by data_start_byte(), so should not be defined in check_special_offset().

Source code in pdr/formats/checkers.py
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
def check_special_position(
    identifiers: DataIdentifiers,
    block: MultiDict,
    target: PhysicalTarget,
    name: str,
    fn: str,
    start_byte: int,
) -> tuple[bool, Optional[int]]:
    """
    Route a table to a product-specific position handler, where one exists,
    instead of relying on generic detection of its row or byte offset within
    a file. Wraps `table_position()`. These are table-specific cases that
    `data_start_byte()` handles only partially, which is why they are defined
    here and not in `check_special_offset()`.

    Branches are tried in order and the first match wins; `(False, None)` is
    returned when nothing matches. Most branches return
    `(True, <handler result>)`. The LRO RSS and Venus Express VRA branches
    return the handler's result as-is, so those handlers presumably build the
    whole tuple themselves (TODO confirm).
    """
    # Positional arguments most of the handlers below have in common.
    shared_args = (identifiers, block, target, name, start_byte)

    if (
        identifiers["INSTRUMENT_ID"] == "MARSIS"
        and " TEC " in identifiers["DATA_SET_NAME"]
    ):
        return True, formats.mex.marsis_get_position(*shared_args)

    huygens_file_tags = re.compile(
        r"DARK|STRIP|VIS_EX|SUN|VISIBL|TIME|SOLAR|IMAGE"
    )
    host = identifiers["INSTRUMENT_HOST_NAME"]
    # NOTE(review): the `name` restriction applies only to the DISR RDR
    # alternative. A Huygens product whose FILE_NAME matches the pattern is
    # routed here whatever `name` is -- confirm that this is intended.
    if (
        host == "HUYGENS PROBE"
        and huygens_file_tags.search(identifiers["FILE_NAME"])
    ) or (
        identifiers["INSTRUMENT_NAME"] == "DESCENT IMAGER SPECTRAL RADIOMETER"
        and identifiers["PRODUCT_TYPE"] == "RDR"
        and name in ("TABLE", "HEADER")
    ):
        # This handler takes `fn` ahead of `start_byte`.
        return True, formats.cassini.get_position(
            identifiers, block, target, name, fn, start_byte
        )

    dataset_id = identifiers["DATA_SET_ID"]
    if (
        "CO-V/E/J/S/SS-RPWS-" in dataset_id
        and identifiers["PRODUCT_TYPE"] == "ANCILLARY"
        and name == "RPWS_TIME_ORDERED_TABLE"
    ):
        return True, formats.cassini.rpws_ancil_position(*shared_args)
    if dataset_id == "LRO-L-RSS-1-TRACKING-V1.0" and name == "WEAREC_TABLE":
        return formats.lro.rss_get_position(*shared_args)
    if (
        dataset_id == "DIF-C-HRIV/MRI-5-HARTLEY2-SHAPE-V1.0"
        and identifiers["PRODUCT_ID"] == "HARTLEY2-CARTESIAN-PLATE-MODEL"
        and "TABLE" in name
    ):
        return True, formats.epoxi.cart_model_get_position(*shared_args)
    if "MEX-M-MRS-5-OCC" in dataset_id and name == "ATM_TABLE":
        return True, formats.mex.mrs_ddr_atmo_position(*shared_args)
    if dataset_id == "PHX-M-SSI-5-ATMOS-OPACITY-V1.0" and "HEADER" in name:
        return True, formats.phoenix.phxao_header_position(*shared_args)

    # Mars Express MRS: two PRODUCT_ID / table-name pairings, one handler.
    if host == "MARS EXPRESS" and identifiers["INSTRUMENT_ID"] == "MRS":
        product_id = identifiers["PRODUCT_ID"]
        if (
            "ICL" in product_id
            and "L1B" in product_id
            and name == "DOPPLER_TABLE"
        ) or (
            "ODF" in product_id
            and "L02" in product_id
            and name == "RANGING_TABLE"
        ):
            return True, formats.mex.mrs_get_position(*shared_args)

    occultation_sets = (
        "ESO1M-SR-APPH-4-OCC-V1.0",
        "ESO22M-SR-APPH-4-OCC-V1.0",
        "IRTF-SR-URAC-4-OCC-V1.0",
        "PAL200-SR-CIRC-4-OCC-V1.0",
        "MCD27M-SR-IIRAR-4-OCC-V1.0",
    )
    if (
        dataset_id in occultation_sets
        and "GEOM" in identifiers["PRODUCT_ID"]
        and name == "SERIES"
    ):
        return True, formats.ground.ebrocc_geom_get_position(*shared_args)
    if (
        dataset_id == "MRO-M-CRISM-5-RDR-MULTISPECTRAL-V1.0"
        and "MRRWV" in identifiers["PRODUCT_ID"]
        and name == "TABLE"
    ):
        return True, formats.mro.crism_mrdr_ancill_position(*shared_args)
    if dataset_id == "MSX-L-SPIRIT3-2/4-V1.0" and name == "ENVI_HEADER":
        # Same shared arguments, with `fn` appended.
        return True, formats.msx.cube_envi_header_position(*shared_args, fn)

    if (
        host == "VENUS EXPRESS"
        and identifiers["INSTRUMENT_ID"] in ("VRA",)
        and identifiers["PRODUCT_TYPE"] == "UDR"
    ):
        return formats.vex_vera.udr_table_special_position()
    return False, None

check_special_qube_band_storage(identifiers: DataIdentifiers)

Defines band storage types for QUBE products whose labels do not correctly specify them. Wraps get_qube_band_storage_type().

Source code in pdr/formats/checkers.py
1216
1217
1218
1219
1220
1221
1222
1223
def check_special_qube_band_storage(identifiers: DataIdentifiers):
    """
    Supply band storage types for QUBE products whose labels do not specify
    them correctly. Wraps `get_qube_band_storage_type()`.

    Returns `(False, None)` unless INSTRUMENT_HOST_NAME is exactly
    "CASSINI_ORBITER"; otherwise returns whatever
    `formats.cassini.get_special_qube_band_storage()` returns.
    """
    # Exact match on the underscore-delimited spelling of the host name.
    if identifiers["INSTRUMENT_HOST_NAME"] != "CASSINI_ORBITER":
        return False, None
    return formats.cassini.get_special_qube_band_storage()

check_special_sample_type(identifiers: DataIdentifiers, base_samp_info: dict) -> tuple[bool, Optional[str]]

Preempt generic mapping of PDS3 data types to numpy dtype strings. Wraps image_sample_type(); called inline by insert_sample_types_into_df().

Source code in pdr/formats/checkers.py
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
def check_special_sample_type(
    identifiers: DataIdentifiers,
    base_samp_info: dict,
) -> tuple[bool, Optional[str]]:
    """
    Override the generic mapping of PDS3 data types to numpy dtype strings
    for specific products. Wraps `image_sample_type()`; called inline by
    `insert_sample_types_into_df()`.

    Branches are tried in order and the first match wins; `(False, None)` is
    returned when nothing matches. Some branches return
    `(True, <handler result>)`, while others return the handler's result
    directly, so those handlers presumably build the tuple themselves
    (TODO confirm).
    """
    dataset_id = identifiers["DATA_SET_ID"]

    if dataset_id == "JNO-J-JIRAM-3-RDR-V1.0":
        # PRODUCT_TYPE is read with .get() here, so a missing key just
        # fails to match.
        if identifiers.get("PRODUCT_TYPE", "") == "RDR":
            return True, formats.juno.jiram_rdr_sample_type()
    if identifiers["INSTRUMENT_ID"] == "LROC":
        if identifiers["PRODUCT_TYPE"] == "EDR":
            # unsigned integers not specified as such
            return True, formats.lroc.lroc_edr_sample_type()
    if dataset_id == "MGN-V-RDRS-5-GVDR-V1.0":
        if (
            "GVANF" in identifiers["PRODUCT_ID"]
            and "N/A" in base_samp_info["SAMPLE_TYPE"]
        ):
            return True, formats.mgn.gvanf_sample_type()
    if dataset_id == "LRO-L-CRAT-2-EDR-RAWDATA-V1.0":
        return formats.lro.crater_bit_col_sample_type(base_samp_info)
    if (
        identifiers["SPACECRAFT_NAME"] == "GALILEO_ORBITER"
        and "-NIMS-2-EDR-V1.0" in dataset_id
    ):
        return formats.galileo.nims_edr_sample_type(base_samp_info)
    if dataset_id == "ULY-J-EPAC-4-SUMM-PHA-24HR-V1.0":
        if identifiers["PRODUCT_ID"].endswith("BIN"):
            return formats.ulysses.get_sample_type(base_samp_info)

    cassini_iss = re.match(
        r"CO-(CAL-ISS|[S/EVJ-]+ISSNA/ISSWA-2)", dataset_id
    )
    if cassini_iss is not None:
        return formats.cassini.line_prefix_sample_type(base_samp_info)
    return False, None

check_special_structure(name: str, block: MultiDict, fn: str, data: PDRLike, identifiers: DataIdentifiers) -> tuple[bool, Optional[tuple[pd.DataFrame, Optional[np.dtype]]]]

Preempt generic ARRAY/TABLE/SPREADSHEET format definition parsing. Wraps parse_array_structure() and parse_table_structure().

Source code in pdr/formats/checkers.py
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
def check_special_structure(
    name: str,
    block: MultiDict,
    fn: str,
    data: PDRLike,
    identifiers: DataIdentifiers
) -> tuple[bool, Optional[tuple[pd.DataFrame, Optional[np.dtype]]]]:
    """
    Substitute a product-specific handler for the generic ARRAY/TABLE/
    SPREADSHEET format definition parsing. Wraps `parse_array_structure()`
    and `parse_table_structure()`.

    Branches are tried in order and the first match wins; `(False, None)` is
    returned when nothing matches. Most branches return
    `(True, <handler result>)`. The Galileo SSI, Odyssey GRS E_KERNEL and
    Venus Express VRA branches return the handler's result directly, so
    those handlers presumably build the tuple themselves (TODO confirm).
    """
    # Argument order taken by nearly every handler below.
    handler_args = (block, name, fn, data, identifiers)
    is_table = name == "TABLE"
    is_time_series = name == "TIME_SERIES"
    dataset_id = identifiers["DATA_SET_ID"]

    if dataset_id == "CLEM1-L-RSS-5-BSR-V1.0" and name == "DATA_TABLE":
        # sequence wrapped as string for object names
        return True, formats.clementine.get_structure(*handler_args)

    host = identifiers["INSTRUMENT_HOST_NAME"]
    if (
        host == "MARS GLOBAL SURVEYOR"
        and identifiers["INSTRUMENT_ID"] == "RSS"
        and identifiers["PRODUCT_TYPE"] == "ODF"
        and name == "ODF3B_TABLE"
    ):
        return True, formats.mgs.get_odf_structure(*handler_args)
    # This branch reads its keys with .get(), so an absent key simply fails
    # to match.
    if (
        identifiers.get("INSTRUMENT_HOST_NAME") == "MARS GLOBAL SURVEYOR"
        and identifiers.get("INSTRUMENT_NAME") == "RADIO SCIENCE SUBSYSTEM"
        and identifiers.get("PRODUCT_TYPE") == "ECS"
    ):
        return True, formats.mgs.get_ecs_structure(*handler_args)

    # TODO: yikes -- this compound Cassini/Huygens condition wants untangling
    is_cassini_rpws_series = (
        host == "CASSINI ORBITER"
        and identifiers["INSTRUMENT_ID"] == "RPWS"
        and is_time_series
    )
    if is_cassini_rpws_series or (
        host == "HUYGENS PROBE"
        and (
            "HUY_DTWG_ENTRY_AERO" in fn
            or (
                "HASI" in data.metaget_("FILE_NAME", "")
                and "PWA" not in identifiers["FILE_NAME"]
            )
        )
    ):
        return True, formats.cassini.get_structure(*handler_args)

    if re.match(r"GP-J-(NMS|ASI)-3-ENTRY-V1.0", dataset_id) and is_table:
        return True, formats.galileo.probe_structure(*handler_args)
    if (
        dataset_id == "GO-E-EPD-2-SAMP-PAD-V1.0"
        and identifiers["PRODUCT_ID"] == "E1PAD_7.TAB"
        and is_time_series
    ):
        return True, formats.galileo.epd_structure(*handler_args)
    if (
        "VEGA" in dataset_id
        and "-C-DUCMA-3-RDR-HALLEY-V1.0" in dataset_id
        and is_table
    ):
        return True, formats.vega.get_structure(*handler_args)
    if (
        "GIO-C-PIA-3-RDR-HALLEY-V1.0" == dataset_id
        or re.match(r"VEGA.-C-PUMA.*", dataset_id)
    ) and name == "ARRAY":
        # This handler takes `name` first, unlike the others.
        return True, formats.vega.fix_array_structure(
            name, block, fn, data, identifiers
        )
    if re.match(r"MRO-M-MCS-(4-RDR|2-EDR)-V1.0", dataset_id) and is_table:
        return True, formats.mro.get_structure(*handler_args)
    if (
        dataset_id == "VG2-SS-PLS-4-SUMM-1HR-AVG-V1.0"
        and is_table
        and block["^STRUCTURE"] == "VGR_PLS_HR_2017.FMT"
    ):
        return True, formats.voyager.get_structure(*handler_args)
    if "IHW-C-SPEC-" in dataset_id and name == "SPECTRUM":
        return True, formats.ihw.get_structure(*handler_args)

    # Phoenix MECA / TEGA products.
    if (
        dataset_id == "PHX-M-MECA-2-NIEDR-V1.0"
        and name == "TBL_TABLE"
        and block["CONTAINER"]["^STRUCTURE"] == "TBL_0_STATE_DATA.FMT"
    ):
        return True, formats.phoenix.elec_em6_structure(*handler_args)
    if (
        dataset_id == "PHX-M-MECA-4-NIRDR-V1.0"
        and identifiers["INSTRUMENT_ID"] == "MECA_AFM"
        and "HEADER_TABLE" in name
    ):
        return True, formats.phoenix.afm_rdr_structure(*handler_args)
    if (
        dataset_id == "PHX-M-TEGA-2-LEDEDR-V1.0"
        and is_time_series
        and block["^STRUCTURE"] == "TEGA_LED.FMT"
    ):
        return True, formats.phoenix.led_edr_structure(*handler_args)
    if (
        dataset_id == "PHX-M-TEGA-4-SCRDR-V1.0"
        and is_time_series
        and block["^STRUCTURE"] == "TEGA_SCRDR.FMT"
    ):
        return True, formats.phoenix.sc_rdr_structure(*handler_args)

    if dataset_id == "MEX-SUN-ASPERA3-4-SWM-V1.0" and is_table:
        return True, formats.mex.aspera_ima_ddr_structure(*handler_args)

    # MIDAS: the handler depends on PRODUCT_ID and the object name.
    if "-MIDAS-3-" in dataset_id:
        product_id = identifiers["PRODUCT_ID"]
        if "SPS" in product_id and is_time_series:
            return True, formats.rosetta.midas_rdr_sps_structure(
                *handler_args
            )
        if "FSC" in product_id and name == "FREQUENCY_SERIES":
            return True, formats.rosetta.fix_pad_length_structure(
                *handler_args
            )

    dataset_name = identifiers["DATA_SET_NAME"]
    if (
        "ROSETTA" in dataset_name
        and "CONSERT" in dataset_name
        and "TABLE" in name
        and name.startswith(("I_", "Q_"))
    ):
        # GRNDBENCH data sets are explicitly sent back to generic parsing.
        if "GRNDBENCH" in dataset_id:
            return False, None
        return True, formats.rosetta.fix_pad_length_structure(*handler_args)

    if (
        identifiers["SPACECRAFT_NAME"] == "GALILEO ORBITER"
        and identifiers["INSTRUMENT_NAME"] in (
            "SOLID_STATE_IMAGING",
            "SOLID STATE IMAGING SYSTEM",
        )
    ):
        return formats.galileo.ssi_redr_structure(*handler_args)
    if (
        dataset_id == "ODY-M-GRS-2-EDR-V1.0"
        and identifiers["PRODUCT_TYPE"] == "E_KERNEL"
        and ".txt" in fn.lower()
    ):
        return formats.odyssey.grs_e_kernel_structure()
    if (
        host == "VENUS EXPRESS"
        and identifiers["INSTRUMENT_ID"] in ("VRA",)
        and identifiers["PRODUCT_TYPE"] == "UDR"
        and not any(tag in fn for tag in ("ODF", "TNF", "RSR"))
    ):
        return formats.vex_vera.udr_table_structure()

    return False, None

check_special_table_reader(identifiers: DataIdentifiers, name: str, fn: str, fmtdef_dt: tuple[pd.DataFrame, np.dtype], block: MultiDict, start_byte: int)

Preempt loaders.datawrap.ReadTable's dispatch to read_table().

Source code in pdr/formats/checkers.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
def check_special_table_reader(
    identifiers: DataIdentifiers,
    name: str,
    fn: str,
    fmtdef_dt: tuple[pd.DataFrame, np.dtype],
    block: MultiDict,
    start_byte: int,
):
    """
    Preempt loaders.datawrap.ReadTable's dispatch to `read_table()`.

    Branches are tried in order and the first match wins; `(False, None)` is
    returned when nothing matches. Nearly every branch returns
    `(True, <loader result>)`. The Odyssey GRS E_KERNEL branch returns the
    loader's result directly, so that loader presumably builds the tuple
    itself (TODO confirm).
    """
    dataset_id = identifiers["DATA_SET_ID"]
    is_table = name == "TABLE"
    is_spreadsheet = name == "SPREADSHEET"

    # Cassini products read with the Cassini spreadsheet loader: either an
    # explicitly listed data set, or any MIMI level-4 ANCILLARY product.
    if dataset_id in (
        "CO-S-MIMI-4-CHEMS-CALIB-V1.0",
        "CO-S-MIMI-4-LEMMS-CALIB-V1.0",
        "CO-S-MIMI-4-INCA-CALIB-V1.0",
        "CO-E/J/S/SW-MIMI-2-LEMMS-UNCALIB-V1.0",
        "CO-SSA-RADAR-3-ABDR-SUMMARY-V1.0",
    ) or (
        identifiers["INSTRUMENT_HOST_NAME"] == "CASSINI ORBITER"
        and identifiers["PRODUCT_TYPE"] == "ANCILLARY"
        and "CO-S-MIMI-4-" in dataset_id
    ):
        return True, formats.cassini.spreadsheet_loader(
            fn, fmtdef_dt, dataset_id
        )

    host = identifiers["INSTRUMENT_HOST_NAME"]
    instrument_id = identifiers["INSTRUMENT_ID"]

    # CHEMIN has mangled object names + positions; the msl_cmn special case
    # is reused for msl_sam qms tables.
    if (instrument_id == "CHEMIN" and "SPREADSHEET" in name) or (
        "MSL-M-SAM-" in dataset_id
        and "QMS" in identifiers["PRODUCT_ID"]
        and "TABLE" in name
    ):
        return True, formats.msl_cmn.spreadsheet_loader(fn)
    if dataset_id == "MSL-M-ROVER-6-RDR-PLACES-V1.0" and is_spreadsheet:
        return True, formats.msl_places.spreadsheet_loader(fn, fmtdef_dt)

    instrument_name = identifiers["INSTRUMENT_NAME"]
    if (
        instrument_name
        == "ROSETTA PLASMA CONSORTIUM - MUTUAL IMPEDANCE "
        "PROBE"
        and "SPECTRUM_TABLE" in name
    ):
        return True, formats.rosetta.rosetta_table_loader(fn, fmtdef_dt)

    # Magellan geometry tables and a few Galileo NIMS SL9 tables share one
    # loader.
    if (
        identifiers["SPACECRAFT_NAME"] == "MAGELLAN"
        and is_table
        and identifiers["NOTE"].startswith("Geometry")
    ) or (
        dataset_id == "GO-J-NIMS-4-ADR-SL9IMPACT-V1.0"
        and is_table
        and any(
            tag in identifiers["PRODUCT_ID"]
            for tag in ("CAL_DATA.TAB", "G_DATA.TAB", "R_DATA.TAB")
        )
    ):
        return True, formats.mgn.geom_table_loader(fn, fmtdef_dt)
    if dataset_id.startswith("MGN-V-RSS-5-OCC-PROF") and is_table:
        return True, formats.mgn.occultation_loader(
            identifiers, fmtdef_dt, block, fn
        )
    if (
        instrument_id == "DLRE"
        and identifiers["PRODUCT_TYPE"] in ("GCP", "PCP", "PRP")
        and is_table
    ):
        return True, formats.diviner.diviner_l4_table_loader(fmtdef_dt, fn)
    if (
        dataset_id == "GO-J-PWS-5-DDR-PLASMA-DENSITY-FULL-V1.0"
        and is_spreadsheet
    ):
        return True, formats.galileo.pws_table_loader(fn, fmtdef_dt)
    if dataset_id == "ODY-M-GRS-5-ELEMENTS-V1.0" and is_table:
        return True, formats.odyssey.map_table_loader(fn, fmtdef_dt)
    if (
        dataset_id == "ULY-J-GAS-5-SKY-MAPS-V1.0"
        and is_table
        and block["^STRUCTURE"] == "GASDATA.FMT"
    ):
        return True, formats.ulysses.gas_table_loader(fn, fmtdef_dt)
    if (
        "MRO-M-MCS-5-DDR" in dataset_id
        and "V1.0" not in dataset_id
        and is_table
    ):
        return True, formats.mro.mcs_ddr_table_loader(block, fn, start_byte)
    if (
        instrument_id == "CRISM"
        and identifiers["PRODUCT_TYPE"] == "ANCILLARY"
        and "OBS" in identifiers["PRODUCT_ID"]
        and is_table
    ):
        return True, formats.mro.ancil_table_loader(fn, fmtdef_dt)

    # International Halley Watch tables.
    if dataset_id == "IHW-C-IRFCURV-3-EDR-HALLEY-V2.0" and is_table:
        return True, formats.ihw.curve_table_loader(fn, fmtdef_dt)
    ihw_newline_sets = (
        "IHW-C-PPFLX-3-RDR-HALLEY-V1.0",
        "IHW-C-PPOL-3-RDR-HALLEY-V1.0",
        "IHW-C-PPSTOKE-3-RDR-HALLEY-V1.0",
        "IHW-C-PPMAG-3-RDR-HALLEY-V1.0",
        "IHW-C-MSNRDR-3-RDR-HALLEY-ETA-AQUAR-V1.0",
        "IHW-C-MSNRDR-3-RDR-HALLEY-ORIONID-V1.0",
        "IHW-C-MSNVIS-3-RDR-HALLEY-ETA-AQUAR-V1.0",
        "IHW-C-MSNVIS-3-RDR-HALLEY-ORIONID-V1.0",
        "IHW-C-IRFTAB-3-RDR-HALLEY-V1.0",
        "IHW-C-IRPOL-3-RDR-HALLEY-V1.0",
        "IHW-C-IRPHOT-3-RDR-HALLEY-V1.0",
    )
    if dataset_id in ihw_newline_sets and is_table:
        return True, formats.ihw.add_newlines_table_loader(
            fmtdef_dt, block, fn, start_byte
        )

    # Voyager LECP tables.
    if (
        dataset_id == "VG1-J-LECP-4-SUMM-SECTOR-15MIN-V1.1" and is_table
    ) or (
        dataset_id == "VG2-U-LECP-4-RDR-STEP-12.8MIN-V1.0"
        and block["INTERCHANGE_FORMAT"] == "ASCII"
        and is_table
    ):
        return True, formats.voyager.lecp_table_loader(fn, fmtdef_dt)
    if dataset_id == "VG1-S-LECP-3-RDR-STEP-6MIN-V1.0" and is_spreadsheet:
        return True, formats.voyager.lecp_vg1_sat_table_loader(fn, fmtdef_dt)

    if dataset_id == "VL2-M-SEIS-5-RDR-V1.0" and (
        is_table or is_spreadsheet
    ):
        return True, formats.viking.seis_table_loader(fn, fmtdef_dt)
    if "MEX-M-ASPERA3-2-EDR-IMA" in dataset_id and is_spreadsheet:
        return True, formats.mex.aspera_table_loader(fn, fmtdef_dt)
    if (
        re.match(r"MER[12]-M-RSS-1-EDR-V1.0", dataset_id)
        and identifiers["PRODUCT_TYPE"] == "UHFD"
        and is_spreadsheet
    ):
        return True, formats.mer.rss_spreadsheet_loader(fn, fmtdef_dt)
    if (
        dataset_id == "PHX-M-MECA-4-NIRDR-V1.0"
        and instrument_id == "MECA_AFM"
        and "TABLE" in name
    ):
        return True, formats.phoenix.afm_table_loader(fn, fmtdef_dt, name)
    if (
        host == "MARS EXPRESS"
        and instrument_id == "MRS"
        and "ODF" in identifiers["PRODUCT_ID"]
        and "L1B" in identifiers["PRODUCT_ID"]
        and "TABLE" in name
    ):
        return True, formats.mex.mrs_l1b_odf_table_loader(fn, fmtdef_dt)
    if (
        dataset_id == "LRO-L-MRFLRO-1-PDR-V1.0"
        and "HK_" in identifiers["PRODUCT_ID"]
        and is_spreadsheet
    ):
        return True, formats.lro.mini_rf_spreadsheet_loader(fn, fmtdef_dt)

    # MSL REMS: the "SP_____" table check comes before the RDR science
    # table check.
    if host == "MARS SCIENCE LABORATORY" and instrument_id == "REMS":
        if "TABLE" in name and "SP_____" in identifiers["PRODUCT_ID"]:
            return True, formats.msl_rems.edr_table_loader(
                fn, fmtdef_dt, block, start_byte
            )
        if name == "REMS_SCIENCE_TABLE" and "RDR" in dataset_id:
            return True, formats.msl_rems.rdr_table_loader(fn, fmtdef_dt)

    if (
        "ICE-C-" in dataset_id
        and "-3-RDR-" in dataset_id
        and "TRAJ_ICE" in fn
        and is_table
    ):
        return True, formats.ihw.add_newlines_table_loader(
            fmtdef_dt, block, fn, start_byte
        )
    if (
        host == "LUNAR RECONNAISSANCE ORBITER"
        and instrument_name == "RADIO SCIENCE SUBSYSTEM"
        and name == "WEAREC_TABLE"
    ):
        return True, formats.lro.wea_table_loader(fn, fmtdef_dt)
    if (
        host == "LUNAR PROSPECTOR"
        and identifiers["PRODUCT_ID"] == "OUTAGES"
        and is_table
    ):
        return True, formats.lp.ancillary_table_loader(fn, fmtdef_dt)
    if (
        dataset_id == "ODY-M-GRS-2-EDR-V1.0"
        and identifiers["PRODUCT_TYPE"] == "E_KERNEL"
        and ".txt" in fn.lower()
    ):
        return formats.odyssey.grs_e_kernel_loader(name, fn)
    if (
        host == "VENUS EXPRESS"
        and instrument_id in ("VRA",)
        and identifiers["PRODUCT_TYPE"] == "UDR"
        and ".RAW" in identifiers["PRODUCT_ID"]
    ):
        return True, formats.vex_vera.udr_table_loader(fn)

    # Not implemented: a MOLA AEDR case (INSTRUMENT_ID "MOLA", ".B" in
    # FILE_NAME, "AEDR" in DATA_SET_ID) is sketched for a loader
    # `formats.mgs.aedr_table_loader(fn, name, fmtdef_dt, block, start_byte,
    # identifiers['RECORD_BYTES'])`, but this doesn't exist yet.
    if (
        instrument_id == "SPEDE"
        and "PD_40_" in identifiers["FILE_NAME"]
        and ".TAB" in identifiers["FILE_NAME"]
    ):
        return True, formats.smart1_esa.spede_plasma40_table_reader(fn)

    return False, None

check_trivial_case(pointer: str, identifiers: DataIdentifiers, fn: str) -> bool

Supplement generic definition of 'trivial' pointers. Intended primarily to preempt attempts to load known-unsupported data objects associated with otherwise-supported products. Called inline by pointer_to_loader().

Source code in pdr/formats/checkers.py
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
def check_trivial_case(pointer: str, identifiers: DataIdentifiers, fn: str) -> bool:
    """
    Supplement generic definition of 'trivial' pointers. Intended primarily to
    preempt attempts to load known-unsupported data objects associated with
    otherwise-supported products. Called inline by `pointer_to_loader()`.

    The checks below are evaluated top to bottom and the first one that
    matches decides the result, so their order is significant.

    Args:
        pointer: name of the label pointer under consideration.
        identifiers: mapping of label identifier names (e.g. "DATA_SET_ID",
            "INSTRUMENT_ID") to their values for this product. Keys are
            indexed directly, so every key consulted before the matching
            check must be present.
        fn: filename string; only inspected for its extension (Magellan)
            and for a name fragment (Chang'e 3 VNIS).

    Returns:
        True when `is_trivial()` accepts the pointer. Otherwise, the return
        value of the first matching special-case function in `formats`
        (presumably always truthy -- the ones visible in this package warn
        and return True; confirm for the rest). False when nothing matches.
    """
    # generic definition first; everything after this is product-specific
    if is_trivial(pointer):
        return True
    # MSL APXS: any pointer whose name contains ERROR_CONTROL_TABLE
    if (
        identifiers["INSTRUMENT_ID"] == "APXS"
        and "ERROR_CONTROL_TABLE" in pointer
    ):
        return formats.msl_apxs.table_loader(pointer)
    # Galileo magnetometer EDR tables
    if (
        identifiers["INSTRUMENT_NAME"] == "TRIAXIAL FLUXGATE MAGNETOMETER"
        and pointer == "TABLE"
        and "-EDR-" in identifiers["DATA_SET_ID"]
    ):
        return formats.galileo.galileo_table_loader()
    # MSL ChemCam RMI image reply tables
    if (
        identifiers["INSTRUMENT_NAME"]
        == "CHEMISTRY CAMERA REMOTE MICRO-IMAGER"
        and pointer == "IMAGE_REPLY_TABLE"
    ):
        return formats.msl_ccam.image_reply_table_loader()
    # THEMIS level-5 products: HEADER and HISTORY pointers
    if identifiers["DATA_SET_ID"].startswith("ODY-M-THM-5") and (
        pointer in ("HEADER", "HISTORY")
    ):
        return formats.themis.trivial_themis_geo_loader(pointer)
    # if re.match(
    #     r"CO-(CAL-ISS|[S/EVJ-]+ISSNA/ISSWA-2)", identifiers["DATA_SET_ID"]
    # ):
    #     if pointer == "LINE_PREFIX_TABLE":
    #         return formats.cassini.trivial_loader(pointer)
    # Cassini ISS calibration products, restricted to FILE_RECORDS == 1025
    if (
        identifiers["DATA_SET_ID"] == "CO-CAL-ISS-2-V1.0"
        and pointer in ("TELEMETRY_TABLE",
                        "LINE_PREFIX_TABLE")
        and identifiers["FILE_RECORDS"] == 1025
    ):
        return formats.cassini.iss_cal_trivial_loader(pointer)
    # Magellan: TABLE pointers in .img / .ibg files (lowercase extensions only)
    if (
        identifiers["SPACECRAFT_NAME"] == "MAGELLAN"
        and (fn.endswith(".img") or fn.endswith(".ibg"))
        and pointer == "TABLE"
    ):
        return formats.mgn.orbit_table_in_img_loader()
    # Galileo SSI calibrated-image cubes: HEADER pointer
    if (
        "GO-A-SSI-3-" in identifiers["DATA_SET_ID"]
        and "-CALIMAGES-V1.0" in identifiers["DATA_SET_ID"]
        and "QUB" in identifiers["PRODUCT_ID"]
        and pointer == "HEADER"
    ):
        return formats.galileo.ssi_cubes_header_loader()
    # MSL CheMin: HEADER pointer
    if identifiers["INSTRUMENT_ID"] == "CHEMIN" and (pointer == "HEADER"):
        return formats.msl_cmn.trivial_header_loader()
    if "MSL-M-SAM-" in identifiers["DATA_SET_ID"] and "FILE" in pointer:
        # reusing the msl_cmn special case for msl_sam 'FILE' pointers
        return formats.msl_cmn.trivial_header_loader()
    # Galileo NIMS: SAMPLE_SPECTRUM_QUBE pointer
    if (
        identifiers["INSTRUMENT_ID"] == "NIMS"
        and identifiers["SPACECRAFT_NAME"] == 'GALILEO_ORBITER'
        and pointer == "SAMPLE_SPECTRUM_QUBE"
    ):
        return formats.galileo.nims_sample_spectral_qube_trivial_loader()
    # Apollo sample (BUGLAB) products: HEADER pointer
    if (
        identifiers["DATA_SET_ID"] == "BUGLAB-L-BUG-4-APOLLO-SAMPLES-V1.0"
        and pointer == "HEADER"
    ):
        return formats.ground.trivial_header_loader()
    # MSL APXS RDR products: HEADER pointer
    if (
        identifiers["DATA_SET_ID"] == "MSL-M-APXS-4/5-RDR-V1.0"
        and pointer == "HEADER"
    ):
        return formats.msl_apxs.trivial_header_loader()
    # Venus Express VRA products with "1A_TNF" in the product id; note that
    # this check does not look at the pointer name at all
    if (
            identifiers["INSTRUMENT_HOST_NAME"] == "VENUS EXPRESS"
            and identifiers["INSTRUMENT_ID"] in ["VRA"]
            and "1A_TNF" in identifiers["PRODUCT_ID"]
    ):
        return formats.vex_vera.trvial_dsn_table()
    # Venus Express VIRTIS: HISTORY pointer
    if (
            identifiers["INSTRUMENT_HOST_NAME"] == "VENUS EXPRESS"
            and identifiers["INSTRUMENT_ID"] in ["VIRTIS"]
            and pointer == "HISTORY"
    ):
        return formats.vex_virtis.trivial_history()
    # MRO MCS DDR V1.0 data sets; applies to every pointer of such products
    if (
        "MRO-M-MCS-5-DDR" in identifiers["DATA_SET_ID"]
        and "V1.0" in identifiers["DATA_SET_ID"]
    ):
        return formats.mro.mcs_ddr_oldformat_trivial()
    # Kaguya (SLN-*) data sets
    if (
        identifiers["DATA_SET_ID"] == 'SLN-L-PACE-3-PBF1-V3.0'
        and pointer == 'TIME_SERIES'
    ):
        return formats.kaguya.pace_time_series_trivial()
    if (
            "SLN-L-SP" in identifiers['DATA_SET_ID']
            and "L2D_RESULT_ARRAY" in pointer
    ):
        return formats.kaguya.sp_l2d_result_array_trivial()
    if (
        identifiers['DATA_SET_ID'] == "SLN-L-TC-4-DEM-ORTHO-V1.0"
        and pointer == "QA_FILENAME"
    ):
        return formats.kaguya.sp_tc_filename_pointer_trivial()
    if (
        identifiers['DATA_SET_ID'] == "SLN-L-GRS-3-ENG-SPECTRUM-V1.0"
        and pointer == "TABLE"
    ):
        return formats.kaguya.grs_eng_tables_trivial()
    # Chang'e products, recognised by filename fragment or spacecraft name
    if (
        "CE3_BMYK_VNIS-CC_SCI_" in fn
        and pointer == "CAL_TARGET_DATA"
    ):
        return formats.change.cal_target_data_trivial()
    if (
        identifiers['SPACECRAFT_NAME'] == 'CE1'
        and "IMAGE_PREFIX" in pointer
    ):
        return formats.change.image_prefix_trivial()
    # no special case applies
    return False

special_image_constants(identifiers: DataIdentifiers) -> dict[str, int]

Defines 'secret' special constants for a dataset or product type. Called inline by Data.find_special_constants().

Source code in pdr/formats/checkers.py
1147
1148
1149
1150
1151
1152
1153
1154
1155
def special_image_constants(identifiers: DataIdentifiers) -> dict[str, int]:
    """
    Return the 'secret' (label-undocumented) special constants that apply to
    a dataset or product type. Called inline by
    `Data.find_special_constants()`.

    Currently only CRISM is covered: its products get NULL = 65535. Every
    other instrument yields an empty dict.
    """
    if identifiers["INSTRUMENT_ID"] == "CRISM":
        return {"NULL": 65535}
    return {}

specialblock(data: PDRLike, name: str)

Special-purpose wrapper for check_special_block() intended for use outside of the query workflow.

Source code in pdr/formats/checkers.py
804
805
806
807
808
809
810
811
812
def specialblock(data: PDRLike, name: str):
    """
    Convenience wrapper around check_special_block() for callers outside of
    the query workflow.

    Returns the special-cased block when check_special_block() reports one
    (its first return value is exactly True); otherwise falls back to
    `data.metablock_(name)`.
    """
    found, special = check_special_block(name, data, data.identifiers)
    return special if found is True else data.metablock_(name)

formats.clementine

get_fn(data, object_name)

HITS * clem_GEO * bsr_rdr_data

Source code in pdr/formats/clementine.py
18
19
20
21
22
23
24
25
def get_fn(data, object_name):
    """
    Pull the target filename out of the `^{object_name}` pointer value: the
    value is split on commas, parentheses and '|', and the second piece is
    taken as the filename. Returns `(True, filename)`.

    HITS
    * clem_GEO
        * bsr_rdr_data
    """
    pointer_value = data.metaget(f"^{object_name}")
    pieces = re.split(r",|[(|)]", pointer_value)
    return True, pieces[1]

get_offset(data, pointer)

HITS * clem_GEO * bsr_rdr_data

Source code in pdr/formats/clementine.py
 8
 9
10
11
12
13
14
15
def get_offset(data, pointer):
    """
    Compute the byte offset of the object named by `pointer`. The pointer
    value is split on commas, parentheses and '|'; its third piece is read
    as a 1-based starting record, which is converted to a byte offset using
    the label's RECORD_BYTES. Returns `(True, offset)`.

    HITS
    * clem_GEO
        * bsr_rdr_data
    """
    pieces = re.split(r",|[(|)]", data.metaget(f"^{pointer}"))
    first_record = int(pieces[2])
    return True, (first_record - 1) * data.metaget("RECORD_BYTES")

get_structure(block, name, filename, data, identifiers)

HITS: * clem_GEO * bsr_rdr_data

Source code in pdr/formats/clementine.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def get_structure(block, name, filename, data, identifiers):
    """
    Build the table structure for these products from the label's format
    definition: the definition is duplicated (stacked on itself), each
    column is renamed to "<first part of NAME>_<row index>", item
    offset/size are set to 8, and BYTES is cleared and refilled before
    sample types are inserted. Returns `(fmtdef, dt)`.

    HITS:
    * clem_GEO
        * bsr_rdr_data
    """
    import numpy as np
    import pandas as pd
    from pdr.loaders.queries import _fill_empty_byte_rows
    from pdr.pd_utils import insert_sample_types_into_df

    single = pdr.loaders.queries.read_table_structure(
        block, name, filename, data, identifiers
    )
    # the label's definition is stacked on itself, doubling the column count
    fmtdef = pd.concat([single, single], ignore_index=True)
    # keep only the part of each name before the first underscore, then
    # make names unique by appending the row index
    base_names = fmtdef["NAME"].str.split("_", expand=True)[0]
    fmtdef["NAME"] = base_names
    fmtdef["NAME"] = fmtdef["NAME"].str.cat(map(str, fmtdef.index), sep="_")
    fmtdef.ITEM_OFFSET = 8
    fmtdef.ITEM_BYTES = 8
    # blank out BYTES so _fill_empty_byte_rows() recomputes it
    fmtdef['BYTES'] = np.nan
    fmtdef = _fill_empty_byte_rows(fmtdef)
    fmtdef, dt = insert_sample_types_into_df(fmtdef, identifiers)
    return fmtdef, dt

formats.dawn

DoesNotExistError

Bases: Exception

Source code in pdr/formats/dawn.py
3
4
5
class DoesNotExistError(Exception):
    """
    Signals that a data object named in a label does not actually exist
    (raised by `dawn_history_hdu_exception()`).
    """

dawn_history_hdu_exception()

filter out spurious HISTORY pointer

HITS * dawn * fc_edr_fit * fc_rdr_fit

Source code in pdr/formats/dawn.py
 8
 9
10
11
12
13
14
15
16
17
18
19
def dawn_history_hdu_exception():
    """
    Filter out the spurious HISTORY pointer by unconditionally raising
    DoesNotExistError.

    HITS
    * dawn
        * fc_edr_fit
        * fc_rdr_fit
    """
    message = "Dawn FITS HISTORY extensions do not actually exist."
    raise DoesNotExistError(message)

formats.diviner

diviner_l4_table_loader(fmtdef_dt, filename)

because these can contain the value "NaN", combined with the fact that they are space-padded, pd.read_csv sometimes casts some columns to object, turning some of their values into strings and some into float, throwing warnings and making it obnoxious to work with them (users will randomly not be able to, e.g., add two columns together without a data cleaning step).

HITS * diviner * l4

Source code in pdr/formats/diviner.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
def diviner_l4_table_loader(fmtdef_dt, filename):
    """
    Load a Diviner L4 table as purely numeric data.

    These files can contain the literal value "NaN" and are space-padded, so
    pd.read_csv sometimes casts columns to object, mixing strings and floats
    (which throws warnings and forces users to clean the data before, e.g.,
    adding two columns together). Parsing with np.loadtxt avoids that. The
    first line of the file is skipped, and column names come from the format
    definition's NAME field, leaving out any containing "PLACEHOLDER".

    HITS
    * diviner
        * l4
    """
    import numpy as np
    import pandas as pd

    values = np.loadtxt(filename, delimiter=",", skiprows=1)
    column_names = [
        column_name
        for column_name in fmtdef_dt[0]["NAME"]
        if "PLACEHOLDER" not in column_name
    ]
    return pd.DataFrame(values, columns=column_names)

formats.epoxi

cart_model_get_position(identifiers, block, target, name, start_byte)

The cartesian shape model's RECORD_BYTES and all three of the tables' ROW_BYTES should be 79 but the label lists them as 80.

HITS * epoxi * shape

Source code in pdr/formats/epoxi.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
def cart_model_get_position(identifiers, block, target, name, start_byte):
    """
    Table position for the cartesian shape model. Its RECORD_BYTES and the
    ROW_BYTES of all three tables should be 79, but the label lists them as
    80, so "start" and "length" are recomputed here with 79-byte rows on top
    of whatever table_position() returns.

    HITS
    * epoxi
        * shape
    """
    true_row_bytes = 79
    position = table_position(identifiers, block, target, name, start_byte)
    # target[1] is used as a 1-based starting row
    position["start"] = true_row_bytes * (target[1] - 1)
    position["length"] = true_row_bytes * block["ROWS"]
    return position

hriv_deconv_mask_start_byte(name, hdulist)

The EPOXI HRIV deconvolved radiance files have incorrect start byte specifications for the MASK HDU.

HITS * epoxi * hriv_deconvolved

Source code in pdr/formats/epoxi.py
22
23
24
25
26
27
28
29
30
31
32
33
def hriv_deconv_mask_start_byte(name, hdulist):
    """
    The EPOXI HRIV deconvolved radiance labels give the wrong start byte for
    the MASK HDU, so it is looked up from the FITS file info instead: the
    header location when `name` contains 'HEADER', the data location
    otherwise.

    HITS
    * epoxi
        * hriv_deconvolved
    """
    location_key = 'hdrLoc' if 'HEADER' in name else 'datLoc'
    return hdulist.fileinfo('MASK')[location_key]

formats.galileo

epd_special_block(data, name)

All 'E1' EPD SUMM products incorrectly say ROW_BYTES = 90; changing them to the RECORD_BYTES values.

HITS * gal_particles * epd_summ (partial)

Source code in pdr/formats/galileo.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
def epd_special_block(data, name):
    """
    Every 'E1' EPD SUMM product wrongly states ROW_BYTES = 90. Return the
    named block with ROW_BYTES overwritten by the label's RECORD_BYTES
    value.

    HITS
    * gal_particles
        * epd_summ (partial)
    """
    block = data.metablock_(name)
    record_bytes = data.metaget_("RECORD_BYTES")
    block["ROW_BYTES"] = record_bytes
    return block

epd_structure(block, name, filename, data, identifiers)

E1PAD_7.TAB has an extra/unaccounted for byte at the start of each row

HITS * gal_particles * epd_samp (partial)

Source code in pdr/formats/galileo.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def epd_structure(block, name, filename, data, identifiers):
    """
    E1PAD_7.TAB carries one extra, unaccounted-for byte at the start of each
    row. Compensate by shifting START_BYTE up by one for the rows labelled
    0 through 8 of the format definition. Returns `(fmtdef, None)`.

    HITS
    * gal_particles
        * epd_samp (partial)
    """
    fmtdef = pdr.loaders.queries.read_table_structure(
        block, name, filename, data, identifiers
    )
    for row_label in range(9):
        shifted = fmtdef.at[row_label, "START_BYTE"] + 1
        fmtdef.at[row_label, "START_BYTE"] = shifted
    return fmtdef, None

galileo_table_loader()

Source code in pdr/formats/galileo.py
28
29
30
31
def galileo_table_loader():
    """
    Warn that Galileo EDR binary tables are unsupported, then return True so
    the pointer is treated as trivial by `check_trivial_case()`.
    """
    message = "Galileo EDR binary tables are not yet supported."
    warnings.warn(message)
    return True

mdis_fits_start_byte(name: str, hdulist: HDUList) -> int

The MDIS cal labels do not include accurate offsets for data objects. (There's also an additional HDU they don't label as a PDS object at all!)

HITS * messenger_grnd_cal * mdis

Source code in pdr/formats/galileo.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def mdis_fits_start_byte(name: str, hdulist: HDUList) -> int:
    """
    The MDIS cal labels do not give accurate offsets for their data objects
    (and there is an additional HDU they do not label as a PDS object at
    all), so offsets are derived here instead: HEADER starts at byte 0 and
    IMAGE starts at the data location of HDU 0.

    Raises NotImplementedError for any name other than "IMAGE" or "HEADER".

    HITS
    * messenger_grnd_cal
        * mdis
    """
    if name == "HEADER":
        return 0
    if name == "IMAGE":
        return hdulist.fileinfo(0)['datLoc']
    raise NotImplementedError("Unknown MDIS extension name")

nims_edr_sample_type(base_samp_info)

Each time byte order is specified for these products it is LSB, so this assumes BIT_STRING refers to LSB_BIT_STRING. N/A samples are read as CHARACTER

HITS * gal_nims * pre_jup

Source code in pdr/formats/galileo.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def nims_edr_sample_type(base_samp_info):
    """
    Resolve sample types the generic logic cannot. Whenever byte order is
    specified for these products it is LSB, so a bare BIT_STRING is taken to
    mean LSB_BIT_STRING. Samples whose type contains "N/A" are read as
    CHARACTER.

    Returns `(True, sample_types(...))` for those two cases and
    `(False, None)` for anything else.

    HITS
    * gal_nims
        * pre_jup
    """
    from pdr.datatypes import sample_types

    labelled_type = base_samp_info["SAMPLE_TYPE"]
    n_bytes = base_samp_info["BYTES_PER_PIXEL"]
    if labelled_type == "BIT_STRING":
        effective_type = "LSB_BIT_STRING"
    elif "N/A" in labelled_type:
        effective_type = "CHARACTER"
    else:
        return False, None
    return True, sample_types(effective_type, int(n_bytes), for_numpy=True)

nims_sample_spectral_qube_trivial_loader()

HITS * gal_nims *cube

Source code in pdr/formats/galileo.py
171
172
173
174
175
176
177
178
179
def nims_sample_spectral_qube_trivial_loader():
    """
    Warn that Galileo NIMS SAMPLE_SPECTRUM_QUBE objects are unsupported
    (they use nibble pixels), then return True so the pointer is treated as
    trivial by `check_trivial_case()`.

    HITS
    * gal_nims
        *cube
    """
    # the space after "supported" matters: the two literals are joined
    # implicitly, and without it the message read "not supporteddue to"
    warnings.warn('Galileo NIMS SAMPLE_SPECTRUM_QUBE objects are not supported '
                  'due to their use of nibble pixels.')
    return True

probe_structure(block, name, filename, data, identifiers)

Several NMS products have an incorrect BYTES value in one column. One ASI product has incorrect BYTES values in multiple columns

HITS * gal_probe * asi * nms

Source code in pdr/formats/galileo.py
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def probe_structure(block, name, filename, data, identifiers):
    """
    Patch wrong BYTES values in the format definition. Several NMS products
    have an incorrect BYTES value in one column, and one ASI product
    (HK01AD.TAB) has incorrect BYTES values in several columns.
    Returns `(fmtdef, None)`.

    HITS
    * gal_probe
        * asi
        * nms
    """
    fmtdef = pdr.loaders.queries.read_table_structure(
        block, name, filename, data, identifiers
    )
    if fmtdef.at[1, "NAME"] == "COUNTS":
        # NMS: the COUNTS column in row 1 is really 8 bytes wide
        byte_fixes = {1: 8}
    elif identifiers["PRODUCT_ID"] == "HK01AD.TAB":
        # ASI: three columns of this one product are mis-sized
        byte_fixes = {1: 4, 3: 2, 5: 2}
    else:
        byte_fixes = {}
    for row_label, n_bytes in byte_fixes.items():
        fmtdef.at[row_label, "BYTES"] = n_bytes
    return fmtdef, None

pws_special_block(data, name)

The PWS SUMM products sometimes undercount ROW_BYTES by 2

HITS * gal_plasma * pws_summ * vg_pws * jup_summ * sat_summ * sys_summ_vg1 * sys_summ_vg2 * sys_ancillary * ur_rdr_bin * ur_rdr_asc * ur_summ_bin * ur_summ_asc * newp_summ_bin * nep_summ_asc

Source code in pdr/formats/galileo.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
def pws_special_block(data, name):
    """
    The PWS SUMM products sometimes undercount ROW_BYTES by 2. Return the
    named block with ROW_BYTES forced to 366 when the PRODUCT_ID contains
    "B.TAB" and to 516 when it contains "E.TAB" (the latter wins if both
    occur); other products are returned untouched.

    HITS
    * gal_plasma
        * pws_summ
    * vg_pws
        * jup_summ
        * sat_summ
        * sys_summ_vg1
        * sys_summ_vg2
        * sys_ancillary
        * ur_rdr_bin
        * ur_rdr_asc
        * ur_summ_bin
        * ur_summ_asc
        * newp_summ_bin
        * nep_summ_asc
    """
    block = data.metablock_(name)
    product_id = data.metaget_("PRODUCT_ID")
    for marker, true_row_bytes in (("B.TAB", 366), ("E.TAB", 516)):
        if marker in product_id:
            block["ROW_BYTES"] = true_row_bytes
    return block

pws_table_loader(filename, fmtdef_dt)

HITS * gal_plasma * pws_ddr

Source code in pdr/formats/galileo.py
156
157
158
159
160
161
162
163
164
165
166
167
168
def pws_table_loader(filename, fmtdef_dt):
    """
    Read a semicolon-delimited table whose header is on the second line of
    the file, then rename its columns to the NAME values of the format
    definition. Asserts that the two column counts agree.

    HITS
    * gal_plasma
        * pws_ddr
    """
    import pandas as pd

    fmtdef, _ = fmtdef_dt
    table = pd.read_csv(filename, header=1, sep=";")
    column_names = fmtdef.NAME.tolist()
    assert len(table.columns) == len(column_names)
    table.columns = column_names
    return table

ssi_cubes_header_loader()

The Ida and Gaspra cubes have HEADER pointers but no defined HEADER objects

HITS * gal_ssi * sb_cube

Source code in pdr/formats/galileo.py
34
35
36
37
38
39
40
41
42
43
def ssi_cubes_header_loader():
    """
    The Ida and Gaspra cubes carry HEADER pointers without any matching
    HEADER object definition, so the pointer is simply reported as trivial
    (always returns True, with no warning).

    HITS
    * gal_ssi
        * sb_cube
    """
    return True

ssi_prefix_block(data, name)

These are binary tables, but the format file has one column with "DATA_TYPE = ASCII_REAL". This special case changes it to CHARACTER because the column's DESCRIPTION calls it a "Real number represented as an ascii string in the form 123.12"

HITS * gal_ssi * redr_late

Source code in pdr/formats/galileo.py
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
def ssi_prefix_block(data, name):
    """
    These are binary tables, yet the format file declares one column with
    "DATA_TYPE = ASCII_REAL". Since that column's DESCRIPTION calls it a
    "Real number represented as an ascii string in the form 123.12", the
    COMPRESSION_RATIO column's DATA_TYPE is rewritten to CHARACTER (in
    place) and the block is returned.

    HITS
    * gal_ssi
        * redr_late
    """
    block = pdr.loaders.queries.get_block(data, name)
    for entry in block.items():
        if "COLUMN" not in entry:
            continue
        column = entry[1]
        if column["NAME"] != "COMPRESSION_RATIO":
            continue
        if "ASCII" in column["DATA_TYPE"]:
            column["DATA_TYPE"] = "CHARACTER"
    return block

ssi_redr_bit_col_format(definition)

Some of the bit columns defined in the Galileo SSI telemetry and line prefix table format files have multiple items, but their ITEM_BITS are mislabeled as BITS.

HITS: * gal_ssi * redr_early * redr_mid * redr_late * sl9_jupiter_impact * go_ssi

Source code in pdr/formats/galileo.py
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
def ssi_redr_bit_col_format(definition):
    """
    Some bit columns in the Galileo SSI telemetry and line prefix table
    format files have multiple items, but their ITEM_BITS are mislabeled as
    BITS. For every BIT_COLUMN that has ITEMS but no ITEM_BITS, ITEM_BITS is
    added: copied from BITS when present, otherwise 1.

    `definition` is fixed in place; the return value is always
    `(False, None)`.

    HITS:
    * gal_ssi
        * redr_early
        * redr_mid
        * redr_late
    * sl9_jupiter_impact
        * go_ssi
    """
    for entry in definition.items():
        if "BIT_COLUMN" not in entry:
            continue
        bit_column = entry[1]
        if "ITEMS" not in bit_column or "ITEM_BITS" in bit_column:
            continue
        if "BITS" in bit_column:
            bit_column.add("ITEM_BITS", bit_column["BITS"])
        else:
            bit_column.add("ITEM_BITS", 1)
    # all changes were made in place on `definition`; nothing to hand back
    return False, None

ssi_redr_prefix_fn(data)

For the early-mission (volumes go_0002-go_0006) SSI REDR line prefix tables. Calling pdr.read() on the .lbl file instead of the .img outputs a different table; it tries to populate with data from the label. TODO: Keep an eye out for more underspecified line prefix tables with this issue, in case it is more common than just a few special cases

HITS * gal_ssi * redr_early

Source code in pdr/formats/galileo.py
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
def ssi_redr_prefix_fn(data):
    """
    For the early-mission (volumes go_0002-go_0006) SSI REDR line prefix
    tables. Calling pdr.read() on the .lbl file instead of the .img yields a
    different table, because it tries to populate the table with data from
    the label. To avoid that, the target filename is derived from
    `data.filename` by replacing ".lbl" with ".img" (and ".LBL" with
    ".IMG"). Returns `(True, target)`.

    TODO: Keep an eye out for more underspecified line prefix tables with
    this issue, in case it is more common than just a few special cases

    HITS
    * gal_ssi
        * redr_early
    """
    image_name = data.filename.replace(".lbl", ".img").replace(".LBL", ".IMG")
    return True, image_name

ssi_redr_structure(block, name, filename, data, identifiers)

Similar to the ssi_redr_bit_col_format() special case above. Columns with multiple ITEMS in the telemetry and line prefix table format files define BYTES but leave out ITEM_BYTES.

HITS * gal_ssi * redr_early * redr_mid * redr_late * sl9_jupiter_impact * go_ssi

Source code in pdr/formats/galileo.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
def ssi_redr_structure(block, name, filename, data, identifiers):
    """
    Companion to the ssi_redr_bit_col_format() special case. Columns with
    multiple ITEMS in the telemetry and line prefix table format files
    define BYTES but omit ITEM_BYTES, so for those rows ITEM_BYTES is filled
    in from BYTES before offsets are recomputed.

    Returns `(True, insert_sample_types_into_df(...))` when the format
    definition has an ITEMS column, and `(False, None)` otherwise.

    HITS
    * gal_ssi
        * redr_early
        * redr_mid
        * redr_late
    * sl9_jupiter_impact
        * go_ssi
    """
    import math
    from pdr.pd_utils import compute_offsets, insert_sample_types_into_df

    fmtdef = pdr.loaders.queries.read_table_structure(
        block, name, filename, data, identifiers
    )
    if "ITEMS" not in fmtdef:
        return False, None

    fmtdef["ITEM_BYTES"] = None
    # line prefix tables have row suffix bytes; telemetry tables do not
    if "ROW_SUFFIX_BYTES" in block:
        fmtdef["ROW_SUFFIX_BYTES"] = block["ROW_SUFFIX_BYTES"]
    # for multi-item columns, what the format file calls BYTES is really
    # the size of a single item
    for row_label in range(len(fmtdef)):
        if math.isnan(fmtdef.at[row_label, "ITEMS"]):
            continue
        fmtdef.at[row_label, "ITEM_BYTES"] = fmtdef.at[row_label, "BYTES"]
    fmtdef = compute_offsets(fmtdef)
    return True, insert_sample_types_into_df(fmtdef, identifiers)

formats.ground

ebrocc_geom_get_position(identifiers, block, target, name, start_byte)

ROW_BYTES = 45 in the labels, but it should be 47

HITS * ground_based * ring_occ_1989_geometry

Source code in pdr/formats/ground.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def ebrocc_geom_get_position(identifiers, block, target, name, start_byte):
    """
    The labels say ROW_BYTES = 45 where it should be 47. The table "length"
    returned by table_position() is therefore replaced with
    ROWS * (ROW_BYTES + 2); everything else is passed through unchanged.

    HITS
    * ground_based
        * ring_occ_1989_geometry
    """
    from pdr.loaders.queries import table_position

    position = table_position(identifiers, block, target, name, start_byte)
    n_rows = block["ROWS"]
    true_row_bytes = block["ROW_BYTES"] + 2
    position["length"] = n_rows * true_row_bytes
    return position

mssso_cal_start_byte(name, hdulist)

A small subset of MSSSO CASPIR calibration images have the wrong start byte for the IMAGE pointer in their PDS3 labels

HITS * sl9_jupiter_impact * mssso_cal

Source code in pdr/formats/ground.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
def mssso_cal_start_byte(name, hdulist):
    """
    A small subset of MSSSO CASPIR calibration images give the wrong start
    byte for the IMAGE pointer in their PDS3 labels. Use byte 0 for any
    name containing 'HEADER' and the data location of HDU 0 for everything
    else.

    HITS
    * sl9_jupiter_impact
        * mssso_cal
    """
    return 0 if 'HEADER' in name else hdulist.fileinfo(0)['datLoc']

trivial_header_loader()

The HEADER pointer is just the SPREADSHEET table's header row, and it does not open because "BYTES = UNK"

HITS * apollo * BUG

Source code in pdr/formats/ground.py
55
56
57
58
59
60
61
62
63
64
65
66
67
def trivial_header_loader():
    """
    The HEADER pointer is merely the SPREADSHEET table's header row, and it
    cannot be opened because the label says "BYTES = UNK". Emit a warning
    and return True so the pointer is treated as trivial.

    HITS
    * apollo
        * BUG
    """
    message = "This product's HEADER pointer is not currently supported."
    warnings.warn(message)
    return True

wff_atm_special_block(data, name)

One WFF/ATM DEM image opens fine (BBMESA2X2), the other two (SCHOONER2X2 and SEDAN2X2) have their LINES and LINE_SAMPLES values backwards.

HITS * wff_atm * dem_img

Source code in pdr/formats/ground.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def wff_atm_special_block(data, name):
    """
    One WFF/ATM DEM image opens fine (BBMESA2X2); the other two (SCHOONER2X2
    and SEDAN2X2) have their LINES and LINE_SAMPLES values backwards. For
    products whose PRODUCT_ID starts with "S", swap the two values in the
    block.

    Returns `(True, block)` when the swap was applied and `(False, block)`
    otherwise.

    HITS
    * wff_atm
        * dem_img
    """
    block = data.metablock_(name)
    if not data.metaget_("PRODUCT_ID").startswith("S"):
        return False, block

    # what the label calls LINES is really LINE_SAMPLES, and vice versa
    labelled_lines, labelled_samples = block["LINES"], block["LINE_SAMPLES"]
    block["LINES"] = labelled_samples
    block["LINE_SAMPLES"] = labelled_lines
    return True, block

formats.ihw

add_newlines_table_loader(fmtdef_dt, block, filename, start_byte)

Some Halley V1.0 tables (MSN, PPN, and IRSN datasets) are missing newline characters between rows. (Also applies to some ICE ephemeris tables)

HITS * ihw * ms_radar * ms_vis * ice * ephem_tbl

Source code in pdr/formats/ihw.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def add_newlines_table_loader(fmtdef_dt, block, filename, start_byte):
    """
    Some Halley V1.0 tables (MSN, PPN, and IRSN datasets) lack newline
    characters between rows; the same goes for some ICE ephemeris tables.
    The table is read ROW_BYTES at a time, a newline is appended to each
    row, and the result is parsed as a fixed-width text table. Columns whose
    name contains "PLACEHOLDER" are dropped from the returned DataFrame.

    HITS
    * ihw
        * ms_radar
        * ms_vis
    * ice
        * ephem_tbl
    """
    from io import StringIO
    import pandas as pd
    from pdr.utils import head_file

    with head_file(filename) as stream:
        # consume everything before the table
        stream.read(start_byte)
        padded = bytearray()
        for _ in range(block["ROWS"]):
            # terminate each fixed-size row with a newline
            padded += stream.read(block["ROW_BYTES"]) + b"\n"
    text_buffer = StringIO(padded.decode())

    # Adapted from _interpret_as_ascii()
    fmtdef, _ = fmtdef_dt
    colspecs = []
    for column in fmtdef.to_dict("records"):
        width = int(column["BYTES"])
        first = column["SB_OFFSET"]
        colspecs.append((first, first + width))
    text_buffer.seek(0)
    table = pd.read_fwf(text_buffer, header=None, colspecs=colspecs)
    text_buffer.close()
    table.columns = fmtdef.NAME.tolist()
    placeholders = [key for key in table.keys() if "PLACEHOLDER" in key]
    return table.drop(placeholders, axis=1)

curve_table_loader(filename, fmtdef_dt)

The labels do not always count column bytes correctly.

HITS * ihw_isrn * curve

Source code in pdr/formats/ihw.py
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
def curve_table_loader(filename, fmtdef_dt):
    """
    The labels do not always count column bytes correctly, so the table is
    parsed as whitespace-delimited text rather than by byte position.
    Column names come from the format definition's NAME field, skipping any
    that contain "PLACEHOLDER"; a column-count mismatch trips an assertion.

    HITS
    * ihw_isrn
        * curve
    """
    import pandas as pd

    column_names = [
        column_name
        for column_name in fmtdef_dt[0].NAME
        if "PLACEHOLDER" not in column_name
    ]
    table = pd.read_csv(filename, header=None, sep=r"\s+")
    assert len(table.columns) == len(column_names), "mismatched column count"
    table.columns = column_names
    return table

get_special_block(data, name)

A handful of MSN Radar tables have column names that were not reading correctly and were ending up as "NaN", which also caused an AttributeError when running ix check.

HITS * ihw * ms_radar

Source code in pdr/formats/ihw.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def get_special_block(data, name):
    """
    A handful of MSN Radar tables have column names that were not read
    correctly and ended up as "NaN", which also caused an AttributeError
    when running ix check. Here, COLUMN entries lacking a NAME get one
    added based on their START_BYTE: 17 -> ">=1SEC" and 21 -> ">=8SEC".
    The block is modified in place and returned.

    HITS
    * ihw
        * ms_radar
    """
    names_by_start_byte = ((17, ">=1SEC"), (21, ">=8SEC"))
    block = data.metablock_(name)
    for entry in block.items():
        if "COLUMN" not in entry:
            continue
        column = entry[1]
        for start_byte, missing_name in names_by_start_byte:
            if column["START_BYTE"] == start_byte and "NAME" not in column:
                column.add("NAME", missing_name)
    return block

get_structure(block, name, filename, data, identifiers)

SSN products with a SPECTRUM pointer were opening with an incorrect column name.

HITS * ihw * spec_hal_cal

Source code in pdr/formats/ihw.py
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def get_structure(block, name, filename, data, identifiers):
    """
    SSN products with a SPECTRUM pointer were opening with an incorrect
    column name. The first row's NAME is replaced by its COLUMN_NAME before
    sample types are inserted. Returns `(fmtdef, dt)`.

    HITS
    * ihw
        * spec_hal_cal
    """
    from pdr.loaders.queries import read_table_structure
    from pdr.pd_utils import insert_sample_types_into_df

    structure = read_table_structure(block, name, filename, data, identifiers)
    # the usable name lives under COLUMN_NAME for these products
    structure.at[0, "NAME"] = structure.at[0, "COLUMN_NAME"]
    structure, dt = insert_sample_types_into_df(structure, identifiers)
    return structure, dt

formats.juno

bit_start_find_and_fix(list_of_pvl_objects_for_bit_columns, start_bit_list)

HITS * juno_jiram * LOG_IMG_RDR * LOG_SPE_RDR * LOG_IMG_EDR * LOG_SPE_EDR * mgs_tes * ATM * BOL * OBS * RAD_tab * pvo * pos_sedr

Source code in pdr/formats/juno.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def bit_start_find_and_fix(
    list_of_pvl_objects_for_bit_columns, start_bit_list
):
    """
    When the last bit column is named NADIR_OFFSET_SIGN, force its start bit
    to 16 and return `(True, start_bit_list)`; otherwise return
    `(False, None)`.

    Note that `start_bit_list` itself is modified in place: the returned
    list is the same object that was passed in.

    HITS
    * juno_jiram
        * LOG_IMG_RDR
        * LOG_SPE_RDR
        * LOG_IMG_EDR
        * LOG_SPE_EDR
    * mgs_tes
        * ATM
        * BOL
        * OBS
        * RAD_tab
    * pvo
        * pos_sedr
    """
    final_column = list_of_pvl_objects_for_bit_columns[-1]
    if final_column.get("NAME") != "NADIR_OFFSET_SIGN":
        return False, None
    start_bit_list[-1] = 16
    return True, start_bit_list

jiram_rdr_sample_type()

JIRAM RDRs, both images and tables, are labeled as MSB but are actually LSB.

HITS * juno_jiram * IMG_RDR * SPE_RDR

Source code in pdr/formats/juno.py
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
def jiram_rdr_sample_type():
    """
    JIRAM RDRs -- images and tables alike -- are labeled as MSB but are
    actually LSB, so the sample type is hard-coded to the type code "<f"
    (presumably consumed as a little-endian float dtype string; confirm
    against the caller).

    HITS
    * juno_jiram
        * IMG_RDR
        * SPE_RDR
    """
    return "<f"

uvs_edr_start_byte(name, hdul)

Sometimes, the start byte is incorrectly recorded in the PDS3 labels (It is always wrong in the PDS4 labels. We do not have a "check" for that yet, so I recommend using the PDS3 labels). Here we use the FITS index defined by the mission for each object to look up the correct start_byte in the HDU fileinfo.

This won't work if HDUs are missing etc, but I have not encountered that.

HITS * juno_uvs * EDR

Source code in pdr/formats/juno.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def uvs_edr_start_byte(name, hdul):
    """
    Look up the start byte of a Juno UVS EDR object from the FITS file itself.

    The start byte is sometimes recorded incorrectly in the PDS3 labels (and
    is always wrong in the PDS4 labels; there is no "check" for that yet, so
    the PDS3 labels are recommended). Each pointer name is mapped to the FITS
    index the mission defines for it, and the byte location is read from that
    HDU's fileinfo: 'hdrLoc' when the pointer name contains 'HEADER',
    'datLoc' otherwise.

    If any step of the lookup raises, a warning is issued and None is
    returned. This won't work if HDUs are missing etc., but that has not been
    encountered.

    HITS
    * juno_uvs
        * EDR
    """
    import warnings

    # indices are in online PDS docs and comments in the labels
    fits_index_by_pointer = {
        "SPECTRAL_VS_SPATIAL_HEADER": 0,
        "SPECTRAL_VS_SPATIAL_IMAGE": 0,
        "SPATIAL_VS_TIME_HEADER": 1,
        "SPATIAL_VS_TIME_QUBE": 1,
        "FRAME_LIST_HEADER": 2,
        "FRAME_LIST_TABLE": 2,
        "SCAN_MIRROR_POSITIONS_HEADER": 3,
        "SCAN_MIRROR_POSITIONS_TABLE": 3,
        "RAW_FRAME_HEADER": 4,
        "RAW_FRAME_TABLE": 4,
        "ANALOG_COUNT_RATE_HEADER": 5,
        "ANALOG_COUNT_RATE_TABLE": 5,
        "DIGITAL_COUNT_RATE_HEADER": 6,
        "DIGITAL_COUNT_RATE_TABLE": 6,
        "PULSE_HEIGHT_DISTRIBUTION_LA_HEADER": 7,
        "PULSE_HEIGHT_DISTRIBUTION_LA_QUBE": 7,
        "PULSE_HEIGHT_DISTRIBUTION_STELLAR_HEADER": 8,
        "PULSE_HEIGHT_DISTRIBUTION_STELLAR_QUBE": 8,
        "PULSE_HEIGHT_DISTRIBUTION_STIM_HEADER": 9,
        "PULSE_HEIGHT_DISTRIBUTION_STIM_QUBE": 9,
        "HOUSEKEEPING_HEADER": 10,
        "HOUSEKEEPING_TABLE": 10,
        "PARAMETER_LIST_HEADER": 11,
        "PARAMETER_LIST_TABLE": 11,
    }

    try:
        file_info = hdul[fits_index_by_pointer[name]].fileinfo()
        location_key = "hdrLoc" if "HEADER" in name else "datLoc"
        return file_info[location_key]
    except Exception:
        warnings.warn("This key doesn't appear to be in the FITS file.")
        return None

uvs_rdr_start_byte(name, hdul)

Sometimes, the start byte is incorrectly recorded in the PDS3 labels (It is always wrong in the PDS4 labels. We do not have a "check" for that yet, so I recommend using the PDS3 labels). Here we use the FITS index defined by the mission for each object to look up the correct start_byte in the HDU fileinfo.

This won't work if HDUs are missing etc, but I have not encountered that.

HITS * juno_uvs * RDR

Source code in pdr/formats/juno.py
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def uvs_rdr_start_byte(name, hdul):
    """
    Look up the start byte of a Juno UVS RDR object from the FITS file itself.

    The start byte is sometimes recorded incorrectly in the PDS3 labels (and
    is always wrong in the PDS4 labels; there is no "check" for that yet, so
    the PDS3 labels are recommended). Each pointer name is mapped to the FITS
    index the mission defines for it, and the byte location is read from that
    HDU's fileinfo: 'hdrLoc' when the pointer name contains 'HEADER',
    'datLoc' otherwise.

    If any step of the lookup raises, a warning is issued and None is
    returned. This won't work if HDUs are missing etc., but that has not been
    encountered.

    HITS
    * juno_uvs
        * RDR
    """
    import warnings

    # indices are in online PDS docs and comments in the labels
    fits_index_by_pointer = {
        "CALIBRATED_SPECTRAL_HEADER": 0,
        "CALIBRATED_SPECTRAL_IMAGE": 0,
        "ACQUISITION_LIST_HEADER": 1,
        "ACQUISITION_LIST_TABLE": 1,
        "CALIBRATED_PHOTON_LIST_HEADER": 2,
        "CALIBRATED_PHOTON_LIST_TABLE": 2,
        "ANCILLARY_DATA_HEADER": 3,
        "ANCILLARY_DATA_TABLE": 3,
        "CALIBRATED_ANALOG_COUNT_RATE_HEADER": 4,
        "CALIBRATED_ANALOG_COUNT_RATE_TABLE": 4,
        "CALIBRATED_DIGITAL_COUNT_RATE_HEADER": 5,
        "CALIBRATED_DIGITAL_COUNT_RATE_TABLE": 5,
        "HOUSEKEEPING_HEADER": 6,
        "HOUSEKEEPING_TABLE": 6,
        "WAVELENGTH_LOOKUP_HEADER": 7,
        "WAVELENGTH_LOOKUP_IMAGE": 7,
        "MASK_INFORMATION_HEADER": 8,
        "MASK_INFORMATION_TABLE": 8,
    }

    try:
        file_info = hdul[fits_index_by_pointer[name]].fileinfo()
        location_key = "hdrLoc" if "HEADER" in name else "datLoc"
        return file_info[location_key]
    except Exception:
        warnings.warn("This key doesn't appear to be in the FITS file.")
        return None

waves_burst_fix_table_names(data, name)

WAVES burst files that include frequency offset tables have mismatched pointer/object names.

HITS * juno_waves * CDR_BURST

Source code in pdr/formats/juno.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def waves_burst_fix_table_names(data, name):
    """
    Return the metadata block that actually describes the pointer `name`.

    WAVES burst files that include frequency offset tables have mismatched
    pointer/object names: the "DATA_TABLE" pointer is described by the
    "TABLE" object, and the "FREQ_OFFSET_TABLE" pointer is described by the
    "DATA_TABLE" object.

    Any other pointer name is looked up unchanged. (Previously such a name
    left the object name unassigned and failed with UnboundLocalError.)

    Returns whatever `data.metablock_` returns for the resolved object name.

    HITS
    * juno_waves
        * CDR_BURST
    """
    # pointer name -> name of the label object that really describes it
    pointer_to_object = {
        "DATA_TABLE": "TABLE",
        "FREQ_OFFSET_TABLE": "DATA_TABLE",
    }
    object_name = pointer_to_object.get(name, name)
    return data.metablock_(object_name)

formats.lro

DoesNotExistError

Bases: Exception

Source code in pdr/formats/lro.py
132
133
134
class DoesNotExistError(Exception):
    """Raised when a pointer's data has zero length in the FITS file."""

crater_bit_col_sample_type(base_samp_info)

HITS * lro_crater * edr_sec * edr_hk

Source code in pdr/formats/lro.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def crater_bit_col_sample_type(base_samp_info):
    """
    Substitute a usable sample type for CRaTER bit columns.

    Reads "SAMPLE_TYPE" and "BYTES_PER_PIXEL" from `base_samp_info`. A sample
    type of exactly "BIT_STRING" is treated as "MSB_BIT_STRING", and any
    sample type containing "N/A" is treated as "MSB_UNSIGNED_INTEGER". In
    both cases returns `(True, <result of sample_types(...)>)`; for every
    other sample type returns `(False, None)`.

    HITS
    * lro_crater
        * edr_sec
        * edr_hk
    """
    from pdr.datatypes import sample_types

    labeled_type = base_samp_info["SAMPLE_TYPE"]
    byte_count = base_samp_info["BYTES_PER_PIXEL"]
    if labeled_type == "BIT_STRING":
        corrected_type = "MSB_BIT_STRING"
    elif "N/A" in labeled_type:
        corrected_type = "MSB_UNSIGNED_INTEGER"
    else:
        return False, None
    numpy_type = sample_types(corrected_type, int(byte_count), for_numpy=True)
    return True, numpy_type

get_crater_offset()

LRO CRaTER EDR products have a header table with 64 bytes per row. The second table's start byte is given in rows (and is also the wrong row), but that table has a different number of row bytes.

HITS * lro_crater * edr_sec * edr_hk

Source code in pdr/formats/lro.py
23
24
25
26
27
28
29
30
31
32
33
34
def get_crater_offset():
    """
    Return `(True, 64)` for LRO CRaTER EDR products.

    These products have a header table with 64 bytes per row. The second
    table's start byte is given in rows (and is also the wrong row), but that
    table has a different number of row bytes.

    HITS
    * lro_crater
        * edr_sec
        * edr_hk
    """
    header_row_bytes = 64
    return True, header_row_bytes

lamp_edr_hdu_exceptions(name, hdulist)

Sometimes all the LAMP EDR table pointers exist, sometimes they aren't actually there.

HITS * lro_lamp * edr

Source code in pdr/formats/lro.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
def lamp_edr_hdu_exceptions(name, hdulist):
    """
    Raise if a LAMP EDR table pointer refers to a table that isn't there.

    Sometimes all the LAMP EDR table pointers exist; sometimes the tables
    they point to aren't actually in the file. The pointer name is mapped to
    its FITS extension name, and if that extension's 'datSpan' is 0 a
    DoesNotExistError is raised. In every other case `(False, None)` is
    returned.

    HITS
    * lro_lamp
        * edr
    """
    extension_names = {
        "ACQUISITION_LIST_TABLE": "Acquisition List",
        "FRAME_DATA_TABLE": "Raw Frame Data",
        "CALCULATED_COUNTRATE_TABLE": "Calculated Countrate",
        "LTS_DATA_TABLE": "LTS Data",
        "HOUSEKEEPING_TABLE": "Housekeeping Data",
    }
    extname = extension_names.get(name)
    if extname is None:
        # Nothing should hit this, but it's here in case there is a rogue
        # product with a [*]_TABLE pointer not listed above
        return False, None

    data_span = hdulist.fileinfo(extname)['datSpan']
    if data_span == 0:
        raise DoesNotExistError(
            f"The {name}'s length is zero; the table does not actually exist."
        )
    return False, None

lamp_rdr_hdu_start_byte(name, hdulist)

This special case raises an error if a pointer's data doesn't actually exist, and returns the correct start byte if it does.

HITS * lro_lamp * rdr

Source code in pdr/formats/lro.py
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
def lamp_rdr_hdu_start_byte(name, hdulist):
    """
    Find the start byte of a LAMP RDR object from the FITS file itself.

    The pointer name is matched (by substring) to a FITS extension name, and
    that extension's fileinfo is consulted:

    * if `name` contains 'HEADER', returns `(True, <hdrLoc>)`;
    * otherwise, if the extension's 'datSpan' is 0, raises
      DoesNotExistError because the data object is not actually present;
    * otherwise returns `(True, <datLoc>)`.

    Pointer names that match none of the known fragments (e.g. the
    CAL_SPECTRAL_IMAGE_* pointers, which open fine) return `(False, None)`.

    HITS
    * lro_lamp
        * rdr
    """
    # (fragment of pointer name, FITS extension name). Checked in order; the
    # first fragment found in `name` wins.
    fragment_to_extname = (
        ("ACQUISITION_LIST", "Acquisition List"),
        ("CAL_PIXELLIST_DATA", "Calibrated Pixel List Mode Data"),
        ("ANCILLARY_DATA", "Ancillary Data"),
        # The multiple CAL_HISTOGRAM_[...]_IMAGE pointers all point at the
        # same FITS HDU (each pointer illegally represents one image in the
        # cube).
        ("CAL_HISTOGRAM_", "Calibrated Histogram Mode Data"),
        ("CAL_CALCULATED_COUNTRATE", "Calculated Countrate"),
        ("LTS_DATA", "LTS Data"),
        ("HOUSEKEEPING", "Housekeeping Data"),
        ("WAVELENGTH_LOOKUP", "Wavelength Lookup Image"),
    )
    extname = next(
        (ext for fragment, ext in fragment_to_extname if fragment in name),
        None,
    )
    if extname is None:
        # The CAL_SPECTRAL_IMAGE_* pointers open fine
        return False, None

    if extname == "Calculated Countrate":
        try:
            # Check to see if this is the correct 'EXTNAM' in the fits HDU
            hdulist.fileinfo(extname)
        except Exception:
            # Sometimes this pointer refers to a different HDU extension
            # name. Only ordinary errors trigger the fallback, so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            extname = "Reduced Count Rate"

    hdu_info = hdulist.fileinfo(extname)
    if 'HEADER' in name:
        return True, hdu_info['hdrLoc']
    if hdu_info['datSpan'] == 0:
        raise DoesNotExistError(
            f"The {name}'s length is zero; the data object does not actually exist."
        )
    return True, hdu_info['datLoc']

lamp_rdr_histogram_image_loader(data)

Products can have multiple unique pointers that are defined by a single image object (CAL_HISTOGRAM_DATA_IMAGE).

Source code in pdr/formats/lro.py
15
16
17
18
19
20
def lamp_rdr_histogram_image_loader(data):
    """
    Return the CAL_HISTOGRAM_DATA_IMAGE metadata block.

    Products can have multiple unique pointers that are all defined by this
    single image object.
    """
    return data.metablock_("CAL_HISTOGRAM_DATA_IMAGE")

mini_rf_image_loader(data, name)

one of the mosaic labels has the wrong values for lines/line_samples

HITS * lro_mini_rf * mosaic

Source code in pdr/formats/lro.py
77
78
79
80
81
82
83
84
85
86
87
88
def mini_rf_image_loader(data, name):
    """
    Return the metadata block for `name` with corrected image dimensions.

    One of the mosaic labels has the wrong values for lines/line_samples, so
    LINES is set to 5760 and LINE_SAMPLES to 11520 on the block before it is
    returned.

    HITS
    * lro_mini_rf
        * mosaic
    """
    image_block = data.metablock_(name)
    corrected_dimensions = (("LINES", 5760), ("LINE_SAMPLES", 11520))
    for keyword, value in corrected_dimensions:
        image_block[keyword] = value
    return image_block

mini_rf_spreadsheet_loader(filename, fmtdef_dt)

Mini-RF housekeeping CSVs have variable-width columns but the labels treat them as fixed-width.

HITS * lro_mini_rf * housekeeping

Source code in pdr/formats/lro.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def mini_rf_spreadsheet_loader(filename, fmtdef_dt):
    """
    Read a Mini-RF housekeeping CSV as a comma-separated table.

    These CSVs have variable-width columns, but the labels treat them as
    fixed-width. `fmtdef_dt` is a 2-element sequence whose first element is
    the format definition; its NAME values become the column names of the
    returned DataFrame.

    HITS
    * lro_mini_rf
        * housekeeping
    """
    import pandas as pd

    fmtdef, _dt = fmtdef_dt
    # Passing `names` explicitly fixes the number of columns at 3. Without it
    # the first row (which only has 1 column) confuses read_csv.
    table = pd.read_csv(
        filename,
        header=None,
        sep=",",
        names=("POINT_NAME", "VALUE", "UNITS"),
    )
    label_names = fmtdef.NAME.tolist()
    assert len(table.columns) == len(label_names)
    table.columns = label_names
    return table

rss_get_position(identifiers, block, target, name, start_byte)

The RSS WEA products' WEAREC_TABLE undercounts ROW_BYTES by 1

HITS * lro_rss * wea

Source code in pdr/formats/lro.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def rss_get_position(identifiers, block, target, name, start_byte):
    """
    Compute table position properties with a corrected table length.

    The RSS WEA products' WEAREC_TABLE undercounts ROW_BYTES by 1, so the
    "length" entry of the properties returned by table_position() is replaced
    with ROWS * (ROW_BYTES + 1).

    Returns `(True, table_props)`.

    HITS
    * lro_rss
        * wea
    """
    table_props = table_position(identifiers, block, target, name, start_byte)
    # one more byte per row than the label claims
    table_props["length"] = block["ROWS"] * (block["ROW_BYTES"] + 1)
    return True, table_props

wea_table_loader(filename, fmtdef_dt)

Some, but not all, wea files have more bytes than the labels define per row.

HITS * lro_rss * wea

Source code in pdr/formats/lro.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def wea_table_loader(filename, fmtdef_dt):
    """
    Read a WEA table by splitting on delimiters rather than byte positions.

    Some, but not all, wea files have more bytes than the labels define per
    row. The first line of the file is skipped, and the remaining lines are
    split on ':' or runs of whitespace. Columns are named from the format
    definition's NAME values, leaving out any that start with 'PLACEHOLDER'.

    HITS
    * lro_rss
        * wea
    """
    import pandas as pd

    fmtdef, _dt = fmtdef_dt

    table = pd.read_csv(
        filename,
        skiprows=1,
        header=None,
        sep=r':|\s+',
        engine='python',
    )
    real_columns = []
    for column_name in fmtdef['NAME']:
        if not column_name.startswith('PLACEHOLDER'):
            real_columns.append(column_name)
    table.columns = real_columns
    return table

formats.lroc

lroc_edr_sample_type()

LROC EDRs specify signed integers but appear to be unsigned.

HITS * lroc * NAC_EDR * WAC_EDR

Source code in pdr/formats/lroc.py
 3
 4
 5
 6
 7
 8
 9
10
11
12
def lroc_edr_sample_type():
    """
    Return the sample type string to use for LROC EDRs.

    The labels specify signed integers, but the data appear to be unsigned,
    so this always returns ">B".

    HITS
    * lroc
        * NAC_EDR
        * WAC_EDR
    """
    corrected_sample_type = ">B"
    return corrected_sample_type

formats.mariner

get_special_block(data, name)

Mariner 9 IRIS tables have 316 ROW_PREFIX_BYTES followed by 1 column with 1500 ITEMS. The column's START_BYTE = 317, but it should be 1.

HITS * mariner * iris

Source code in pdr/formats/mariner.py
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
def get_special_block(data, name):
    """
    Return the metadata block for `name` with the column START_BYTE fixed.

    Mariner 9 IRIS tables have 316 ROW_PREFIX_BYTES followed by 1 column with
    1500 ITEMS. The label gives that column START_BYTE = 317, but it should
    be 1, so the block's COLUMN entry is updated accordingly.

    HITS
    * mariner
        * iris
    """
    table_block = data.metablock_(name)
    column = table_block["COLUMN"]
    column["START_BYTE"] = 1
    return table_block

formats.mer

rss_spreadsheet_loader(filename, fmtdef_dt)

The RSS UHFD labels have the wrong ROWS value for most products.

HITS * mer_rss * uhfd

Source code in pdr/formats/mer.py
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
def rss_spreadsheet_loader(filename, fmtdef_dt):
    """
    Read an RSS UHFD table as a header-less comma-separated file.

    The RSS UHFD labels have the wrong ROWS value for most products.
    `fmtdef_dt` is a 2-element sequence whose first element is the format
    definition; its NAME values become the column names of the returned
    DataFrame.

    HITS
    * mer_rss
        * uhfd
    """
    import pandas as pd

    fmtdef, _dt = fmtdef_dt
    label_names = fmtdef.NAME.tolist()
    table = pd.read_csv(filename, header=None, sep=",")
    assert len(table.columns) == len(label_names)
    table.columns = label_names
    return table

formats.mex

aspera_ima_ddr_structure(block, name, filename, data, identifiers)

The ASPERA IMA DDR table opens correctly as written in its label, but the BYTES values for columns 3 and 4 are wrong.

HITS * mex_aspera * ima_ddr

Source code in pdr/formats/mex.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def aspera_ima_ddr_structure(block, name, filename, data, identifiers):
    """
    Read the table structure and correct two BYTES values.

    The ASPERA IMA DDR table opens correctly as written in its label, but the
    BYTES values for columns 3 and 4 are wrong. They are set to 12 here (rows
    2 and 3 of the format definition).

    Returns `(fmtdef, None)`.

    HITS
    * mex_aspera
        * ima_ddr
    """
    from pdr.loaders.queries import read_table_structure

    fmtdef = read_table_structure(block, name, filename, data, identifiers)
    for row_index in (2, 3):
        fmtdef.at[row_index, "BYTES"] = 12
    return fmtdef, None

aspera_table_loader(filename, fmtdef_dt)

The ASPERA IMA EDRs are ascii csv tables containing 2 data types: SENSOR and MODE. The VALUES column is repeated and has 96 items total. In the MODE rows only the first VALUES item contains data, and should be followed by 95 'missing' items. In reality these rows have 96 empty/missing items because of an extra comma. This special case cuts off the extra column during the pd.read_csv() call.

HITS * mex_aspera * ima

Source code in pdr/formats/mex.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def aspera_table_loader(filename, fmtdef_dt):
    """
    Read an ASPERA IMA EDR table, discarding a surplus trailing column.

    The ASPERA IMA EDRs are ascii csv tables containing 2 data types: SENSOR
    and MODE. The VALUES column is repeated and has 96 items total. In the
    MODE rows only the first VALUES item contains data, and it should be
    followed by 95 'missing' items. In reality these rows have 96
    empty/missing items because of an extra comma. Only as many leading
    columns as the format definition has NAME entries are read, which cuts
    off the extra column during the pd.read_csv() call.

    HITS
    * mex_aspera
        * ima
    """
    import pandas as pd

    fmtdef, _dt = fmtdef_dt
    label_names = fmtdef.NAME.tolist()
    wanted_columns = range(len(label_names))
    table = pd.read_csv(filename, header=None, usecols=wanted_columns)
    assert len(table.columns) == len(label_names)
    table.columns = label_names
    return table

marsis_get_position(identifiers, block, target, name, start_byte)

HITS * mex_marsis * TEC_EDR

Source code in pdr/formats/mex.py
 6
 7
 8
 9
10
11
12
13
14
15
16
def marsis_get_position(identifiers, block, target, name, start_byte):
    """
    Compute table position properties with a recalculated table length.

    The "length" entry of the properties returned by table_position() is
    replaced with FILE_RECORDS (from `identifiers`) times a fixed 143 bytes
    per record.

    HITS
    * mex_marsis
        * TEC_EDR
    """
    table_props = table_position(identifiers, block, target, name, start_byte)
    bytes_per_record = 143
    table_props["length"] = identifiers["FILE_RECORDS"] * bytes_per_record
    return table_props

mrs_ddr_atmo_position(identifiers, block, target, name, start_byte)

The MRS derived atmosphere profiles were opening with data cut off at the ends of the tables. Recalculating the table length with ROW_BYTES = 278 instead of 276 fixes it.

HITS * mex_mrs * occ_atmo

Source code in pdr/formats/mex.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def mrs_ddr_atmo_position(identifiers, block, target, name, start_byte):
    """
    Compute table position properties with a recalculated table length.

    The MRS derived atmosphere profiles were opening with data cut off at the
    ends of the tables. Recalculating the table length with ROW_BYTES = 278
    instead of 276 fixes it, so "length" is set to 278 * ROWS.

    HITS
    * mex_mrs
        * occ_atmo
    """
    true_row_bytes = 278
    table_props = table_position(identifiers, block, target, name, start_byte)
    table_props["length"] = true_row_bytes * block["ROWS"]
    return table_props

mrs_get_position(identifiers, block, target, name, start_byte)

MRS ICL level 1b DOPPLER_TABLEs and ODF level 2 RANGING_TABLEs undercount ROW_BYTES by 1.

HITS * mex_mrs * lvl_1b_icl (partial) * lvl_2_odf (partial)

Source code in pdr/formats/mex.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def mrs_get_position(identifiers, block, target, name, start_byte):
    """
    Compute table position properties with a corrected table length.

    MRS ICL level 1b DOPPLER_TABLEs and ODF level 2 RANGING_TABLEs undercount
    ROW_BYTES by 1, so "length" is set to (ROW_BYTES + 1) * ROWS.

    HITS
    * mex_mrs
        * lvl_1b_icl (partial)
        * lvl_2_odf (partial)
    """
    table_props = table_position(identifiers, block, target, name, start_byte)
    # one more byte per row than the label claims
    table_props["length"] = (block["ROW_BYTES"] + 1) * block["ROWS"]
    return table_props

mrs_l1b_odf_rmp_redirect(data)

RMP tables are a subset of MRS level 1b ODFs that were not opening because their pointer and object names do not match.

HITS: * mex_mrs * lvl_1b_odf (partial)

Source code in pdr/formats/mex.py
148
149
150
151
152
153
154
155
156
157
158
159
def mrs_l1b_odf_rmp_redirect(data):
    """
    Return the RAMP_TABLE metadata block.

    RMP tables are a subset of MRS level 1b ODFs that were not opening
    because their pointer and object names do not match.

    HITS:
    * mex_mrs
        * lvl_1b_odf (partial)
    """
    return data.metablock_("RAMP_TABLE")

mrs_l1b_odf_table_loader(filename, fmtdef_dt)

MRS level 1b ODF labels have variable and sometimes incorrect ROW_BYTES values.

HITS * mex_mrs * lvl_1b_odf

Source code in pdr/formats/mex.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def mrs_l1b_odf_table_loader(filename, fmtdef_dt):
    """
    Read an MRS level 1b ODF table as whitespace-separated text.

    MRS level 1b ODF labels have variable and sometimes incorrect ROW_BYTES
    values. Columns are named from the format definition's NAME values,
    leaving out any that start with 'PLACEHOLDER'.

    HITS
    * mex_mrs
        * lvl_1b_odf
    """
    import pandas as pd

    fmtdef, _dt = fmtdef_dt
    table = pd.read_csv(filename, header=None, sep=r"\s+")
    real_columns = []
    for column_name in fmtdef['NAME']:
        if not column_name.startswith('PLACEHOLDER'):
            real_columns.append(column_name)
    table.columns = real_columns
    return table

pfs_edr_special_block(data, name)

The PFS EDRs have a few errors in their labels prior to orbit 8945, after which they are corrected.

HITS * mex_marsis * raw_lwc * raw_swc * cal_lwc * cal_swc * hk_early_mission * orb001_lwc * orb001_swc

Source code in pdr/formats/mex.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def pfs_edr_special_block(data, name):
    """
    Patch known label errors in early-mission PFS EDR products.

    The PFS EDRs have a few errors in their labels prior to orbit 8945, after
    which they are corrected. When ORBIT_NUMBER is "N/A" or below 8945, the
    block's ROWS is replaced with FILE_RECORDS and the DATA_TYPE of the two
    observation time columns is overwritten, and `(True, block)` is returned.
    Otherwise the block is returned untouched as `(False, block)`.

    HITS
    * mex_marsis
        * raw_lwc
        * raw_swc
        * cal_lwc
        * cal_swc
        * hk_early_mission
        * orb001_lwc
        * orb001_swc
    """
    block = data.metablock_(name)
    orbit = data.metaget_("ORBIT_NUMBER")

    if orbit != "N/A" and int(orbit) >= 8945:
        return False, block

    # Fixes the number of rows in the table by replacing ROWS with
    # FILE_RECORDS.
    block["ROWS"] = data.metaget_("FILE_RECORDS")
    # Replaces the time columns' DATA_TYPEs with the correct type based on
    # products created later in the mission.
    for entry in block.items():
        if "COLUMN" not in entry:
            continue
        column = entry[1]
        column_name = column["NAME"]
        if column_name == "OBT OBSERVATION TIME":
            column["DATA_TYPE"] = "PC_REAL"
        elif column_name == "SCET OBSERVATION TIME":
            column["DATA_TYPE"] = "PC_UNSIGNED_INTEGER"
    return True, block

vmc_rdr_hdu_selection(name, hdulist)

The VMC RDRs have 1 IMAGE pointer and 2 IMAGE objects. From the volume's readme: "The first layer includes the calibrated values, and the second layer includes the raw values." It is unclear whether or not the 'second layer' is a copy of the EDR image or if intermediate calibration steps have been applied to it. Assuming the single band image is akin to the EDRs, this special case returns the multiband calibrated image.

HITS * mex_vmc * rdr

Source code in pdr/formats/mex.py
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
def vmc_rdr_hdu_selection(name, hdulist):
    """
    Return the data start byte ('datLoc') of HDU 1 in a VMC RDR file.

    The VMC RDRs have 1 IMAGE pointer and 2 IMAGE objects. From the volume's
    readme: "The first layer includes the calibrated values, and the second
    layer includes the raw values." It is unclear whether or not the 'second
    layer' is a copy of the EDR image or if intermediate calibration steps
    have been applied to it.
    Assuming the single band image is akin to the EDRs, this special case
    returns the multiband calibrated image.

    `name` is accepted but not used.

    HITS
    * mex_vmc
        * rdr
    """
    hdu_info = hdulist.fileinfo(1)
    return hdu_info['datLoc']

formats.mgn

geom_table_loader(filename, fmtdef_dt)

The Magellan radar system geometry tables include null bytes between rows.

HITS * gal_nims * impact * mgn_image * midr_tables

Source code in pdr/formats/mgn.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def geom_table_loader(filename, fmtdef_dt):
    """
    Read a geometry table after stripping the null bytes out of it.

    The Magellan radar system geometry tables include null bytes between
    rows. The file contents are read, every null byte is removed, and the
    remaining text is parsed as a header-less CSV. Columns are named from the
    format definition's NAME values, leaving out any that contain
    'PLACEHOLDER'.

    HITS
    * gal_nims
        * impact
    * mgn_image
        * midr_tables
    """
    import pandas as pd
    from pdr.utils import head_file

    fmtdef, _dt = fmtdef_dt
    with head_file(filename) as stream:
        cleaned = stream.read().replace(b"\x00", b"")
    text_buffer = StringIO(cleaned.decode())
    table = pd.read_csv(text_buffer, header=None)
    column_names = [
        label_name
        for label_name in fmtdef['NAME']
        if 'PLACEHOLDER' not in label_name
    ]
    assert len(table.columns) == len(column_names), 'column name mismatch'
    text_buffer.close()
    table.columns = column_names
    return table

get_fn(data)

HITS * mgn_post_mission * fmap * fmap_browse

Source code in pdr/formats/mgn.py
44
45
46
47
48
49
50
51
52
def get_fn(data):
    """
    Return `(True, data.filename)`.

    HITS
    * mgn_post_mission
        * fmap
        * fmap_browse
    """
    return True, data.filename

occultation_loader(identifiers, fmtdef_dt, block, filename)

Checks end of each row for newline character. If missing, removes extraneous newline from middle of the row and adjusts for the extra byte. Adapted from _interpret_as_ascii()

HITS * mgn_occult * ddr

Source code in pdr/formats/mgn.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def occultation_loader(identifiers, fmtdef_dt, block, filename):
    """
    Read a fixed-width table whose rows may contain a stray newline.

    The file is read ROW_BYTES at a time, FILE_RECORDS times. A row that does
    not end in a newline has its newline(s) removed and is completed with one
    additional byte read from the file, which adjusts for the extra byte. The
    repaired text is then parsed as fixed-width columns using each format
    definition record's SB_OFFSET and BYTES. Adapted from
    _interpret_as_ascii().

    The returned DataFrame has the "PLACEHOLDER_0" column dropped.

    HITS
    * mgn_occult
        * ddr
    """
    import pandas as pd

    fmtdef, _dt = fmtdef_dt
    row_length = block["ROW_BYTES"]

    repaired = bytearray()
    with head_file(filename) as stream:
        for _ in range(identifiers["FILE_RECORDS"]):
            chunk = stream.read(row_length)
            if chunk.endswith(b"\n"):
                repaired += chunk
                continue
            # The newline is in the middle of the row: drop it and pull in
            # one more byte to complete the row.
            repaired += chunk.replace(b"\n", b"") + stream.read(1)
    text_buffer = StringIO(repaired.decode())
    # adapted from _interpret_as_ascii()
    colspecs = [
        (record["SB_OFFSET"], record["SB_OFFSET"] + record["BYTES"])
        for record in fmtdef.to_dict("records")
    ]
    table = pd.read_fwf(text_buffer, header=None, colspecs=colspecs)
    text_buffer.close()

    table.columns = fmtdef.NAME.tolist()
    return table.drop("PLACEHOLDER_0", axis=1)

orbit_table_in_img_loader()

HITS * mgn_post_mission * fmap * fmap_browse

Source code in pdr/formats/mgn.py
34
35
36
37
38
39
40
41
def orbit_table_in_img_loader():
    """
    Always return True.

    HITS
    * mgn_post_mission
        * fmap
        * fmap_browse
    """
    is_special_case = True
    return is_special_case

formats.mgs

get_ecs_structure(block, name, filename, data, identifiers)

HITS * mgs_rss_raw * ecs

Source code in pdr/formats/mgs.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def get_ecs_structure(block, name, filename, data, identifiers):
    """
    Read the table structure and override one column's START_BYTE.

    The START_BYTE in row 5 of the format definition is set to 80 and the
    block's ROW_BYTES is copied into a "ROW_BYTES" column. Offsets are then
    recomputed and sample types inserted.

    Returns the `(fmtdef, dt)` pair produced by
    insert_sample_types_into_df().

    HITS
    * mgs_rss_raw
        * ecs
    """
    from pdr.pd_utils import insert_sample_types_into_df, compute_offsets

    structure = read_table_structure(block, name, filename, data, identifiers)
    structure.at[5, "START_BYTE"] = 80
    structure["ROW_BYTES"] = block.get("ROW_BYTES")

    structure = compute_offsets(structure)
    structure, dtypes = insert_sample_types_into_df(structure, identifiers)
    return structure, dtypes

get_odf_structure(block, name, filename, data, identifiers)

Source code in pdr/formats/mgs.py
 6
 7
 8
 9
10
11
12
13
14
15
16
def get_odf_structure(block, name, filename, data, identifiers):
    """
    Read the table structure and override one column's BYTES value.

    The BYTES in row 7 of the format definition is set to 2 and the block's
    ROW_BYTES is copied into a "ROW_BYTES" column before sample types are
    inserted.

    Returns the `(fmtdef, dt)` pair produced by
    insert_sample_types_into_df().
    """
    from pdr.pd_utils import insert_sample_types_into_df

    structure = read_table_structure(block, name, filename, data, identifiers)
    structure.at[7, "BYTES"] = 2
    structure["ROW_BYTES"] = block.get("ROW_BYTES")

    structure, dtypes = insert_sample_types_into_df(structure, identifiers)
    return structure, dtypes

mola_pedr_special_block(data, name, identifiers)

Fix for FILE_RECORDS = "UNK" and ROWS = "UNK" in the MOLA PEDR labels. This special case calculates ROWS using the count_from_bottom_of_file() logic in reverse.

HITS * mgs_mola * pedr * mgs_sampler * pedr

Source code in pdr/formats/mgs.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def mola_pedr_special_block(data, name, identifiers):
    """
    Return the metadata block for `name` with ROWS computed from file size.

    Fix for FILE_RECORDS = "UNK" and ROWS = "UNK" in the MOLA PEDR labels.
    ROWS is calculated using the count_from_bottom_of_file() logic in
    reverse: the bytes between the table's start byte and the end of the file
    are divided by ROW_BYTES.

    HITS
    * mgs_mola
        * pedr
    * mgs_sampler
        * pedr
    """
    import os
    from pathlib import Path
    from pdr.loaders.queries import data_start_byte

    block = data.metablock_(name)
    pointer_target = data.metaget_("^"+name)
    table_start = data_start_byte(
        identifiers, block, pointer_target, data.filename
    )

    file_size = os.path.getsize(Path(data.filename))
    bytes_in_table = file_size - table_start
    block["ROWS"] = int(bytes_in_table / block["ROW_BYTES"])

    return block

formats.mro

ancil_table_loader(filename, fmtdef_dt)

In the CRISM ancillary OBS tables, missing values are variations of "N/A", which causes mixed dtype warnings when the first row contains N/A's.

HITS * crism * extras_obs

Source code in pdr/formats/mro.py
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def ancil_table_loader(filename, fmtdef_dt):
    """
    Read a CRISM ancillary OBS table with its "N/A" variants treated as
    missing values.

    In these tables, missing values are variations of "N/A", which causes
    mixed dtype warnings when the first row contains N/A's. The padded "N/A"
    spellings are passed to read_csv as NA markers, and columns 0, 44, 46, 47
    and 48 are read as strings. Columns are named from the NAME values of
    `fmtdef_dt[0]`, leaving out any that contain "PLACEHOLDER".

    HITS
    * crism
        * extras_obs
    """
    import pandas as pd

    na_markers = [
        'N/A  ',
        'N/A   ',
        'N/A             ',
        'N/A                       ',
    ]
    string_columns = dict.fromkeys((0, 44, 46, 47, 48), str)
    table = pd.read_csv(
        filename,
        header=None,
        na_values=na_markers,
        dtype=string_columns,
    )

    label_names = []
    for column_name in fmtdef_dt[0]['NAME']:
        if "PLACEHOLDER" not in column_name:
            label_names.append(column_name)
    assert len(table.columns) == len(label_names), "mismatched column count"
    table.columns = label_names
    return table

crism_mrdr_ancill_position(identifiers, block, target, name, start_byte)

ROW_BYTES = 14 in the labels, but it should be 16 (the RECORD_BYTES)

HITS * crism * ancil_mrdr

Source code in pdr/formats/mro.py
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
def crism_mrdr_ancill_position(identifiers, block, target, name, start_byte):
    """
    Compute table position properties with a corrected table length.

    ROW_BYTES = 14 in the labels, but it should be 16 (the RECORD_BYTES), so
    "length" is set to ROWS * RECORD_BYTES.

    HITS
    * crism
        * ancil_mrdr
    """
    from pdr.loaders.queries import table_position

    table_props = table_position(identifiers, block, target, name, start_byte)
    # RECORD_BYTES stands in for the label's incorrect ROW_BYTES
    table_props["length"] = block["ROWS"] * identifiers["RECORD_BYTES"]
    return table_props

get_structure(block, name, filename, data, identifiers)

The first column in the MCS (EDR/RDR/DDR) format files is just named "1", which is read as 'int'. This was causing problems in read_table during the table.drop call.

HITS * mro * mcs_edr * mcs_rdr

Source code in pdr/formats/mro.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
def get_structure(block, name, filename, data, identifiers):
    """
    Read the table structure and force every column NAME to be a string.

    The first column in the MCS (EDR/RDR/DDR) format files is just named "1",
    which is read as 'int'. This was causing problems in read_table during
    the table.drop call.

    Returns `(fmtdef, None)`.

    HITS
    * mro
        * mcs_edr
        * mcs_rdr
    """
    fmtdef = read_table_structure(block, name, filename, data, identifiers)
    names_as_text = fmtdef["NAME"].values.astype(str)
    fmtdef["NAME"] = names_as_text
    return fmtdef, None

mcs_ddr_oldformat_trivial()

These files are outdated and have formatting issues that make the current table reader (mcs_ddr_table_loader below) not work.

The tables can be sometimes loaded by mcs_ddr_table_loader if you subtract two from the start_byte, but this has not been exhaustively tested.

HITS: * mro * mcs_ddr_v1

Source code in pdr/formats/mro.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def mcs_ddr_oldformat_trivial():
    """
    Trivial loader for the outdated V1.0 MCS DDR tables: emit a warning that
    they are unsupported and return True.

    These files have formatting issues that make the current table reader
    (mcs_ddr_table_loader) not work. The tables can sometimes be loaded by
    mcs_ddr_table_loader if you subtract two from the start_byte, but this
    has not been exhaustively tested.

    HITS:
    * mro
        * mcs_ddr_v1
    """
    import warnings

    message = (
        'The V1.0 MRO MCS DDR tables (from MCSDDRV1) are not '
        'supported by PDR, use a more recent version of the DDR'
        ' Tables on the PDS.'
    )
    warnings.warn(message)
    return True

mcs_ddr_table_loader(block, filename, start_byte)

The newer (V6.0 and above) DDR files can be opened into a dataframe with some massaging. The dataset records have a metadata block (described by MCS_DDR1.FMT) followed by 105 lines of data (each described by MCS_DDR2.FMT, the 105 is "repetitions" in the label). This continues until the end of the file.

For the purposes of outputting a single table, the metadata block info is added to each row of 105 data rows that follow it. So per record block, 105 lines are added to the dataframe. This is because the metadata and data rows have different columns, so they can't be in the same table as alternating rows as in the .tab file structure.

The MCS DDR V1.0, which is now out of date at the node, doesn't quite work with this code. If the start_byte is set to 2888 it seems to work on a few cases, but this has not fully been tested via ix.

HITS: * mro * mcs_ddr

Source code in pdr/formats/mro.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def mcs_ddr_table_loader(block, filename, start_byte):
    """
    Load an MCS DDR table into a single DataFrame.

    The newer (V6.0 and above) DDR files can be opened into a dataframe with
    some massaging. The dataset records have a metadata block (described by
    MCS_DDR1.FMT) followed by 105 lines of data (each described by
    MCS_DDR2.FMT, the 105 is "repetitions" in the label). This continues until
    the end of the file.

    For the purposes of outputting a single table, the metadata block info is
    added to each of the data rows that follow it. This is because the
    metadata and data rows have different columns, so they can't be in the
    same table as alternating rows as in the .tab file structure.

    A record block may be shorter than the label's REPETITIONS. In that case
    a warning is issued, only the data rows actually present are attached to
    that block's metadata, and parsing resumes at the row where the block
    ended (which must be the next metadata row).

    The MCS DDR V1.0, which is now out of date at the node, doesn't quite work
    with this code. If the start_byte is set to 2888 it seems to work on a few
    cases, but this has not fully been tested via ix.

    Parameters:
        block: label block; block['CONTAINER']['REPETITIONS'] gives the number
            of data rows expected after each metadata row.
        filename: path of the file to read.
        start_byte: byte offset at which the first metadata row starts.

    Returns:
        pandas.DataFrame with one row per data row; the 77 metadata columns
        come first, followed by the 15 data columns.

    Raises:
        TypeError: if a row expected to be a metadata row does not have 77
            comma-separated fields.

    HITS:
    * mro
        * mcs_ddr
    """
    import pandas as pd
    import warnings

    # Combined column and dtypes as described in the two format files
    # "QUAL" was called "1" but that is confusing and not meaningful re: how
    # the format label described it.
    columns = [
        "QUAL", "DATE", "UTC", "SCLK", "L_S", "SOLAR_DIST", "ORB_NUM", "GQUAL",
        "SOLAR_LAT", "SOLAR_LON", "SOLAR_ZEN", "LTST", "PROFILE_LAT",
        "PROFILE_LON", "PROFILE_RAD", "PROFILE_ALT", "LIMB_ANG", "ARE_RAD",
        "SURF_LAT", "SURF_LON", "SURF_RAD", "T_SURF", "T_SURF_ERR",
        "T_NEAR_SURF", "T_NEAR_SURF_ERR", "DUST_COLUMN", "DUST_COLUMN_ERR",
        "H2OVAP_COLUMN", "H2OVAP_COLUMN_ERR", "H2OICE_COLUMN",
        "H2OICE_COLUMN_ERR", "CO2ICE_COLUMN", "CO2ICE_COLUMN_ERR", "P_SURF",
        "P_SURF_ERR", "P_RET_ALT", "P_RET", "P_RET_ERR", "RQUAL", "P_QUAL",
        "T_QUAL", "DUST_QUAL", "H2OVAP_QUAL", "H2OICE_QUAL", "CO2ICE_QUAL",
        "SURF_QUAL", "OBS_QUAL", "REF_SCLK_0", "REF_SCLK_1", "REF_SCLK_2",
        "REF_SCLK_3", "REF_SCLK_4", "REF_SCLK_5", "REF_SCLK_6", "REF_SCLK_7",
        "REF_SCLK_8", "REF_SCLK_9", "REF_DATE_0", "REF_UTC_0", "REF_DATE_1",
        "REF_UTC_1", "REF_DATE_2", "REF_UTC_2", "REF_DATE_3", "REF_UTC_3",
        "REF_DATE_4", "REF_UTC_4", "REF_DATE_5", "REF_UTC_5", "REF_DATE_6",
        "REF_UTC_6", "REF_DATE_7", "REF_UTC_7", "REF_DATE_8", "REF_UTC_8",
        "REF_DATE_9", "REF_UTC_9","1_layer", "PRES", "T", "T_ERR", "DUST",
        "DUST_ERR", "H2OVAP", "H2OVAP_ERR", "H2OICE", "H2OICE_ERR", "CO2ICE",
        "CO2ICE_ERR", "ALT", "LAT", "LON"
    ]
    # NOTE(review): paired positionally with `columns`, this list maps
    # SURF_QUAL/OBS_QUAL to float64 and REF_SCLK_9 to string, unlike the
    # other *_QUAL (int64) and REF_SCLK_* (float64) columns. Left unchanged;
    # confirm against the format files whether that is intended.
    dtypes = [
        "string", "string", "string", "float64", "float64", "float64",
        "int64", "int64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "int64", "int64", "int64", "int64", "int64",
        "int64", "int64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "string", "string", "string", "string", "string",
        "string", "string", "string", "string", "string", "string", "string",
        "string", "string", "string", "string", "string", "string", "string",
        "string", "string", "string", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64","float64", "float64",
        "float64", "float64", "float64", "float64", "float64"
    ]
    dtype_map = dict(zip(columns, dtypes))
    block_size = block['CONTAINER']['REPETITIONS']  # data rows per record
    meta_width = 77  # standard length of a metadata row
    data_width = 15  # standard length of a data row

    def split_fields(row):
        # there are also random huge spaces between each record block. we
        # strip those out, along with extra quotation marks
        return [
            field.strip().strip('"').strip(" ") for field in row.split(",")
        ]

    with open(filename, "rb") as f:
        f.seek(start_byte)
        data = f.read()
    # record and metadata rows are divided by new line
    rows = [row.decode("ascii", errors="replace") for row in data.split(b"\n")]
    combined_rows = []
    i = 0
    # the final element of `rows` is whatever follows the last newline, so it
    # is never treated as the start of a record block
    while i < len(rows) - 1:
        meta_fields = split_fields(rows[i])
        if len(meta_fields) != meta_width:
            warnings.warn("Metadata block missing from expected location in "
                          "the DDR file.")
            raise TypeError(
                f"Expected metadata row not found (row {i} after start_byte "
                f"has {len(meta_fields)} fields, expected {meta_width})"
            )
        i += 1
        block_end = min(i + block_size, len(rows))
        # attach this block's metadata to each data row that follows it,
        # stopping at the first row that is not a data row so that rows from
        # the next record block are never paired with the wrong metadata
        while i < block_end:
            data_fields = split_fields(rows[i])
            if len(data_fields) != data_width:
                warnings.warn("DDR file has incomplete record blocks. "
                              "Searching for next metadata block.")
                break
            combined_rows.append(meta_fields + data_fields)
            i += 1
    result = pd.DataFrame(combined_rows, columns=columns)
    result = result.astype(dtype=dtype_map)

    return result

formats.msl_apxs

table_loader(pointer)

we don't support these right now, or maybe ever

HITS * msl_apxs * APXS_SCIENCE_EDR

Source code in pdr/formats/msl_apxs.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
def table_loader(pointer):
    """
    Trivial loader for MSL APXS tables we don't support right now, or maybe
    ever. Warns that the tables for `pointer` are unsupported and returns
    True.

    HITS
    * msl_apxs
        * APXS_SCIENCE_EDR
    """
    message = f"The MSL APXS {pointer} tables are not currently supported."
    warnings.warn(message)
    return True

trivial_header_loader()

The HEADER pointer is just the SPREADSHEET table's header row, and it does not open because "BYTES = UNK"

HITS * msl_apxs * APXS_OXIDE_RDR * APXS_SPECTRUM_RDR

Source code in pdr/formats/msl_apxs.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
def trivial_header_loader():
    """
    Trivial loader for MSL APXS RDR HEADER pointers: warns that they are
    unsupported and returns True.

    The HEADER pointer is just the SPREADSHEET table's header row, and it
    does not open because "BYTES = UNK".

    HITS
    * msl_apxs
        * APXS_OXIDE_RDR
        * APXS_SPECTRUM_RDR
    """
    message = "The MSL APXS RDR HEADER pointers are not currently supported."
    warnings.warn(message)
    return True

formats.msl_ccam

image_reply_table_loader()

HITS * msl_ccam * CCAM_RMI_EDR

Source code in pdr/formats/msl_ccam.py
 6
 7
 8
 9
10
11
12
13
14
15
16
def image_reply_table_loader():
    """
    Trivial loader for MSL ChemCam IMAGE_REPLY binary tables: warns that they
    are unsupported and returns True.

    HITS
    * msl_ccam
        * CCAM_RMI_EDR
    """
    message = (
        "MSL ChemCam IMAGE_REPLY binary tables are not supported "
        "due to a formatting error in label files."
    )
    warnings.warn(message)
    return True

formats.msl_cmn

fix_mangled_name(data)

HITS * msl_cmn * HOUSEKEEPING

Source code in pdr/formats/msl_cmn.py
44
45
46
47
48
49
50
51
52
def fix_mangled_name(data):
    """
    Return the label block for the hard-coded object name
    "CHMN_HSKN_HEADER_TABLE", looked up with data.metablock_().

    NOTE(review): presumably the name under which this object is requested
    differs from the object name in the label (hence "mangled") -- confirm
    against the caller.

    HITS
    * msl_cmn
        * HOUSEKEEPING
    """
    return data.metablock_("CHMN_HSKN_HEADER_TABLE")

get_offset(object_name)

incorrectly specifies object length rather than start byte

HITS * msl_cmn * DIFFRACTION_ALL_RDR * ENERGY_SINGLE_RDR * MINERAL_TABLES * CCD_FRAME * DIFFRACTION_SINGLE * DIFFRACTION_SPLIT * DIFFRACTION_ALL * ENERGY_ALL * ENERGY_SINGLE * ENERGY_SPLIT * HOUSKEEPING * TRANSMIT_RAW

Source code in pdr/formats/msl_cmn.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def get_offset(object_name):
    """
    Return a hard-coded start byte for objects whose label incorrectly
    specifies object length rather than start byte.

    Returns (True, offset) for the recognised object names "HISTOGRAM"
    (offset 300) and "CHMN_HSK_HEADER_TABLE" (offset 0), and (False, None)
    for anything else.

    HITS
    * msl_cmn
        * DIFFRACTION_ALL_RDR
        * ENERGY_SINGLE_RDR
        * MINERAL_TABLES
        * CCD_FRAME
        * DIFFRACTION_SINGLE
        * DIFFRACTION_SPLIT
        * DIFFRACTION_ALL
        * ENERGY_ALL
        * ENERGY_SINGLE
        * ENERGY_SPLIT
        * HOUSKEEPING
        * TRANSMIT_RAW
    """
    fixed_offsets = (
        ("HISTOGRAM", 300),
        ("CHMN_HSK_HEADER_TABLE", 0),
    )
    for known_name, offset in fixed_offsets:
        if object_name == known_name:
            return True, offset
    return False, None

spreadsheet_loader(filename)

HITS * msl_cmn * DIFFRACTION_ALL_RDR * ENERGY_SINGLE_RDR * MINERAL_TABLES * msl_sam * l0_qms * l1a_qms * l1b_qms

Source code in pdr/formats/msl_cmn.py
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
def spreadsheet_loader(filename):
    """
    Read the file as a CSV with pandas' default read_csv() settings and
    return the resulting DataFrame.

    HITS
    * msl_cmn
        * DIFFRACTION_ALL_RDR
        * ENERGY_SINGLE_RDR
        * MINERAL_TABLES
    * msl_sam
        * l0_qms
        * l1a_qms
        * l1b_qms
    """
    from pandas import read_csv

    table = read_csv(filename)
    return table

trivial_header_loader()

HITS * msl_cmn * DIFFRACTION_ALL_RDR * ENERGY_SINGLE_RDR * MINERAL_TABLES * msl_sam * l0_hk * l0_qms * l0_gc * l0_tls * l1a_hk * l1a_qms * l1a_gc * l1a_tls * l1b_qms * l1b_gc * l2_qms * l2_gc * l2_tls

Source code in pdr/formats/msl_cmn.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def trivial_header_loader():
    """
    Trivial loader: does no work and always returns True.

    HITS
    * msl_cmn
        * DIFFRACTION_ALL_RDR
        * ENERGY_SINGLE_RDR
        * MINERAL_TABLES
    * msl_sam
        * l0_hk
        * l0_qms
        * l0_gc
        * l0_tls
        * l1a_hk
        * l1a_qms
        * l1a_gc
        * l1a_tls
        * l1b_qms
        * l1b_gc
        * l2_qms
        * l2_gc
        * l2_tls
    """
    return True

formats.msl_places

spreadsheet_loader(filename, fmtdef_dt)

HITS * msl_places * localizations

Source code in pdr/formats/msl_places.py
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
def spreadsheet_loader(filename, fmtdef_dt):
    """
    Read the file as a comma-separated table and rename its columns to the
    NAME values of the format definition.

    `fmtdef_dt` is a (fmtdef, dt) pair; only fmtdef.NAME is used. The number
    of columns read must equal the number of names.

    HITS
    * msl_places
        * localizations
    """
    import pandas as pd

    fmtdef, _ = fmtdef_dt
    table = pd.read_csv(filename, sep=",")
    column_names = fmtdef.NAME.tolist()
    assert len(table.columns) == len(column_names)
    # NAME is converted again for the assignment, as a separate list object
    table.columns = fmtdef.NAME.tolist()
    return table

formats.msl_rems

edr_offset(data, name)

HITS: * msl_rems * edr_HSDEF # edr_HSREG

Source code in pdr/formats/msl_rems.py
29
30
31
32
33
34
35
36
37
def edr_offset(data, name):
    """
    Compute the start byte from the second element of the "^" + name pointer
    value, minus one. Returns (True, start_byte).

    HITS:
    * msl_rems
        * edr_HSDEF
        # edr_HSREG
    """
    pointer_value = data.metaget_("^" + name)
    # NOTE(review): presumably converts a 1-indexed position to a 0-indexed
    # byte offset -- confirm
    return True, pointer_value[1] - 1

edr_table_loader(filename, fmtdef_dt, block, start_byte)

The ROW_SUFFIX_BYTES are either miscounted by a few bytes, or we don't handle them correctly. There appears to be a related issue with the tables' start bytes as well. This special case bypasses both issues.

HITS * msl_rems * edr_SP

Source code in pdr/formats/msl_rems.py
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def edr_table_loader(filename, fmtdef_dt, block, start_byte):
    """
    Read one table of a REMS EDR product as CSV rows rather than by byte
    position.

    The ROW_SUFFIX_BYTES are either miscounted by a few bytes, or we don't
    handle them correctly. There appears to be a related issue with the
    tables' start bytes as well. This special case bypasses both issues by
    converting `start_byte` to a number of rows to skip and then reading
    block["ROWS"] rows.

    Columns are named from the format definition's NAME values, leaving out
    any name containing "PLACEHOLDER".

     HITS
    * msl_rems
        * edr_SP
    """
    import pandas as pd

    fmtdef, _ = fmtdef_dt

    # number of rows to skip (there are multiple table pointers per product)
    # NOTE(review): 399 is presumably the byte length of one row -- confirm
    rows_to_skip = int(start_byte / 399)
    table = pd.read_csv(
        filename, header=None, skiprows=rows_to_skip, nrows=block["ROWS"]
    )

    real_names = [
        col for col in fmtdef_dt[0]['NAME'] if "PLACEHOLDER" not in col
    ]
    assert len(table.columns) == len(real_names), "mismatched column count"
    table.columns = real_names
    return table

rdr_table_loader(filename, fmtdef_dt)

Missing values are variations of "UNK" and "NULL", which cause mixed dtype warnings when using the default pd.read_csv() parameters.

HITS * msl_rems * rdr_rmd * rdr_rnv * rdr_rtl

Source code in pdr/formats/msl_rems.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def rdr_table_loader(filename, fmtdef_dt):
    """
    Read a REMS RDR table as a header-less CSV, treating the padded "UNK" and
    "NULL" strings listed below as missing values.

    Missing values are variations of "UNK" and "NULL", which cause mixed
    dtype warnings when using the default pd.read_csv() parameters.

    Columns are named from the format definition's NAME values, leaving out
    any name containing "PLACEHOLDER".

    HITS
    * msl_rems
        * rdr_rmd
        * rdr_rnv
        * rdr_rtl
    """
    import pandas as pd

    # the missing-value markers appear with varying amounts of left padding
    unk_variants = [' UNK', '    UNK', '     UNK', '      UNK',
                    '       UNK', '         UNK']
    null_variants = ['   NULL', '    NULL']
    table = pd.read_csv(
        filename, header=None, na_values=unk_variants + null_variants
    )

    real_names = [
        col for col in fmtdef_dt[0]['NAME'] if "PLACEHOLDER" not in col
    ]
    assert len(table.columns) == len(real_names), "mismatched column count"
    table.columns = real_names
    return table

formats.nh

get_fn(data)

The PEPSSI DDRs have an extra space at the start of the SPREADSHEET pointer's filename that causes 'file not found' errors.

HITS * nh_derived * atmos_comp * nh_pepssi * flux_resampled

Source code in pdr/formats/nh.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
def get_fn(data):
    """
    Return (True, path), where path is the label's own path with its
    extension replaced by ".csv" (same directory, same stem).

    The PEPSSI DDRs have an extra space at the start of the SPREADSHEET
    pointer's filename that causes 'file not found' errors, so the data
    filename is derived from data.labelname instead.

    HITS
    * nh_derived
        * atmos_comp
    * nh_pepssi
        * flux_resampled
    """
    label_path = Path(data.labelname)
    csv_name = f"{label_path.stem}.csv"
    return True, label_path.parent / csv_name

formats.odyssey

grs_e_kernel_loader(name, fn)

The GRS Experimenter's Notebook products have two "FILE" objects with one "TIME_SERIES" pointer each. The first object/pointer is for the time series table, the other is for a .TXT notes file. Because the text file's pointer has "SERIES" in it, pointer_to_loader() sends it to ReadTable().

This special case reads it with read_text() instead.

HITS * mars_odyssey * edr_e_kernel

Source code in pdr/formats/odyssey.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def grs_e_kernel_loader(name, fn):
    """
    Load the notes file of a GRS Experimenter's Notebook product as text.

    These products have two "FILE" objects with one "TIME_SERIES" pointer
    each. The first object/pointer is for the time series table, the other is
    for a .TXT notes file. Because the text file's pointer has "SERIES" in
    it, pointer_to_loader() sends it to ReadTable().

    This special case reads it with read_text() instead and returns
    (True, text).

    HITS
    * mars_odyssey
        * edr_e_kernel
    """
    from pdr.loaders.text import read_text

    notes = read_text(name, fn)
    return True, notes

grs_e_kernel_structure()

Handles the same files as grs_e_kernel_loader() above, and is needed to avoid an error thrown before that special case can be called. Because the second TIME_SERIES pointer is not actually a table, parse_table_structure() fails when trying to make a fmtdef.

HITS * mars_odyssey * edr_e_kernel

Source code in pdr/formats/odyssey.py
37
38
39
40
41
42
43
44
45
46
47
48
def grs_e_kernel_structure():
    """
    Structure special case for the GRS Experimenter's Notebook notes file;
    always returns (True, None).

    Handles the same files as grs_e_kernel_loader(), and is needed to avoid
    an error thrown before that special case can be called. Because the
    second TIME_SERIES pointer is not actually a table,
    parse_table_structure() fails when trying to make a fmtdef.

    HITS
    * mars_odyssey
        * edr_e_kernel
    """
    return True, None

map_table_loader(filename, fmtdef_dt)

A few products open fine from their labels, but most do not. Seems like a byte counting issue in the labels.

HITS * mars_odyssey * maps

Source code in pdr/formats/odyssey.py
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
def map_table_loader(filename, fmtdef_dt):
    """
    Read a map table as whitespace-delimited text instead of by byte
    position.

    A few products open fine from their labels, but most do not. Seems like
    a byte counting issue in the labels.

    Columns are named from the format definition's NAME values, leaving out
    any name containing 'PLACEHOLDER'.

    HITS
    * mars_odyssey
        * maps
    """
    import pandas as pd

    column_names = [
        col for col in fmtdef_dt[0]['NAME'] if 'PLACEHOLDER' not in col
    ]
    # Some tables use tabs as column delimiters, others use spaces.
    table = pd.read_csv(filename, sep=r"\s+", header=None)
    assert len(table.columns) == len(column_names), "Mismatched column count"
    table.columns = column_names
    return table

formats.phoenix

afm_rdr_structure(block, name, filename, data, identifiers)

AFM RDR header tables: Several columns' NAME fields start with lowercase letters, which is_an_assignment_line() in /parselabel/pds3.py evaluates as NOT an assignment statement.

HITS * phoenix * afm_rdr

Source code in pdr/formats/phoenix.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def afm_rdr_structure(block, name, filename, data, identifiers):
    """
    Repair the format definition of AFM RDR header tables.

    Several columns' NAME fields start with lowercase letters, which
    is_an_assignment_line() in /parselabel/pds3.py evaluates as NOT an
    assignment statement. For the affected rows the COLUMN_NUMBER value is a
    string that also contains the "NAME = ..." text; it is split back into
    its COLUMN_NUMBER and NAME parts here. The NAME column is also moved to
    position 1 of the fmtdef.

    Returns a (fmtdef, None) tuple.

    HITS
    * phoenix
        * afm_rdr
    """
    from pdr.loaders.queries import read_table_structure

    structure = read_table_structure(block, name, filename, data, identifiers)
    name_column = structure.pop('NAME')
    structure.insert(1, 'NAME', name_column)
    for row in range(len(structure)):
        raw_number = structure.at[row, "COLUMN_NUMBER"]
        if not isinstance(raw_number, str):
            continue
        if "NAME" not in raw_number:
            continue
        # text before the separator is the column number, text after it is
        # the column name that the label parser failed to split off
        structure.at[row, "COLUMN_NUMBER"] = raw_number.split("NAME = ")[0]
        structure.at[row, "NAME"] = raw_number.split("NAME = ")[1]
    return structure, None

afm_table_loader(filename, fmtdef_dt, name)

AFM RDR tables: Several labels miscount bytes somewhere in the tables

HITS * phoenix * afm_rdr

Source code in pdr/formats/phoenix.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def afm_table_loader(filename, fmtdef_dt, name):
    """
    Read one AFM RDR table by row position rather than by byte position.

    AFM RDR tables: Several labels miscount bytes somewhere in the tables, so
    each table is read as comma-separated rows using a fixed number of rows
    to skip and to read, selected by `name`.

    Parameters:
        filename: path or file-like object passed to pandas.read_csv().
        fmtdef_dt: pair whose first element is the format definition; its
            NAME values (minus any containing "PLACEHOLDER") become the
            column names.
        name: table name. Any name containing "HEADER_TABLE" selects the
            4-row header; otherwise it must be one of AFM_F_ERROR_TABLE,
            AFM_F_HEIGHT_TABLE, AFM_B_ERROR_TABLE or AFM_B_HEIGHT_TABLE.

    Returns:
        pandas.DataFrame holding the selected table.

    Raises:
        ValueError: if `name` is not a recognised AFM RDR table name.
        AssertionError: if the number of columns read does not match the
            number of column names.

    HITS
    * phoenix
        * afm_rdr
    """
    import pandas as pd

    # (rows to skip, rows to read): a 4-row header followed by four
    # consecutive 512-row tables
    data_table_layout = {
        "AFM_F_ERROR_TABLE": (4, 512),
        "AFM_F_HEIGHT_TABLE": (516, 512),
        "AFM_B_ERROR_TABLE": (1028, 512),
        "AFM_B_HEIGHT_TABLE": (1540, 512),
    }
    if "HEADER_TABLE" in name:
        num_rows_skipped, num_rows = 0, 4
    elif name in data_table_layout:
        num_rows_skipped, num_rows = data_table_layout[name]
    else:
        # previously this fell through and failed with UnboundLocalError
        raise ValueError(f"Unrecognized AFM RDR table name: {name!r}")
    table = pd.read_csv(
        filename,
        header=None,
        sep=",",
        skiprows=num_rows_skipped, nrows=num_rows
    )
    names = [c for c in fmtdef_dt[0]['NAME'] if "PLACEHOLDER" not in c]
    assert len(table.columns) == len(names), "mismatched column count"
    table.columns = names
    return table

elec_em6_structure(block, name, filename, data, identifiers)

ELEC EDR em6/TBL tables: All the START_BYTEs in TBL_0_STATE_DATA.FMT are off by 36 bytes.

HITS * phoenix * elec_edr (partial)

Source code in pdr/formats/phoenix.py
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
def elec_em6_structure(block, name, filename, data, identifiers):
    """
    Build the format definition for ELEC EDR em6/TBL tables with corrected
    start bytes.

    All the START_BYTEs in TBL_0_STATE_DATA.FMT are off by 36 bytes, so 36 is
    subtracted from START_BYTE on every row whose BLOCK_NAME is "TBL0 DATA"
    before offsets and sample types are computed.

    HITS
    * phoenix
        * elec_edr (partial)
    """
    from pdr.pd_utils import insert_sample_types_into_df, compute_offsets
    from pdr.loaders.queries import read_table_structure

    structure = read_table_structure(block, name, filename, data, identifiers)
    for row in range(len(structure)):
        if structure.at[row, "BLOCK_NAME"] == "TBL0 DATA":
            corrected = structure.at[row, "START_BYTE"] - 36
            structure.at[row, "START_BYTE"] = corrected
    structure = compute_offsets(structure)
    return insert_sample_types_into_df(structure, identifiers)

led_edr_structure(block, name, filename, data, identifiers)

TEGA_LED.FMT: the CONTAINER's REPETITIONS should be 1000, not 1010

HITS * phoenix * lededr

Source code in pdr/formats/phoenix.py
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
def led_edr_structure(block, name, filename, data, identifiers):
    """
    Build the format definition for TEGA LED EDR tables with the corrected
    repetition count.

    TEGA_LED.FMT: the CONTAINER's REPETITIONS should be 1000, not 1010. The
    fmtdef is truncated to the rows that remain with 1000 repetitions, and
    BLOCK_REPETITIONS is set to 1000 on the "LED_RECORDS" rows, before
    offsets and sample types are computed.

    HITS
    * phoenix
        * lededr
    """
    from pdr.pd_utils import insert_sample_types_into_df, compute_offsets
    from pdr.loaders.queries import read_table_structure

    structure = read_table_structure(block, name, filename, data, identifiers)
    real_repetitions = 1000
    # NOTE(review): presumably 5 non-repeated rows plus 3 rows per
    # repetition -- confirm against TEGA_LED.FMT
    rows_to_keep = 5 + (real_repetitions * 3)
    structure = structure.iloc[0:rows_to_keep, :]

    for row in range(len(structure)):
        if structure.at[row, "BLOCK_NAME"] != "LED_RECORDS":
            continue
        structure.at[row, "BLOCK_REPETITIONS"] = 1000

    structure = compute_offsets(structure)
    return insert_sample_types_into_df(structure, identifiers)

phxao_header_position(identifiers, block, target, name, start_byte)

PHXAO tables: Some table headers have lost trailing whitespace assumed to be present by the label. Treat as newline-delimited instead; the record count is correct.

HITS * phoenix * atm_phxao

Source code in pdr/formats/phoenix.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def phxao_header_position(identifiers, block, target, name, start_byte):
    """
    Return row-based position properties for PHXAO table headers.

    PHXAO tables: Some table headers have lost trailing whitespace
    assumed to be present by the label.  Treat as newline-delimited
    instead; the record count is correct.

    The result starts at 0, is flagged "as_rows", and takes its length from
    _extract_table_records(block). Only `block` is used; the other
    parameters are ignored.

    HITS
    * phoenix
       * atm_phxao
    """
    from pdr.loaders.queries import _extract_table_records

    record_count = _extract_table_records(block)
    position = {"as_rows": True, "start": 0, "length": record_count}
    return position

phxao_table_offset(filename, identifiers)

PHXAO tables: Some table headers have lost trailing whitespace assumed to be present by the label. Recalculate the table offset assuming that the table itself is still fixed-width.

HITS * phoenix * atm_phxao

Source code in pdr/formats/phoenix.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def phxao_table_offset(filename, identifiers):
    """
    Recompute the start byte of a PHXAO table from the end of the file.

    PHXAO tables: Some table headers have lost trailing whitespace
    assumed to be present by the label.  Recalculate the table offset
    assuming that the table itself is still fixed-width.

    ROWS and ROW_BYTES are read from `identifiers` and handed to
    count_from_bottom_of_file(). Returns (True, start_byte).

    HITS
    * phoenix
       * atm_phxao
    """
    from pdr.loaders._helpers import count_from_bottom_of_file

    n_rows = identifiers["ROWS"]
    bytes_per_row = identifiers["ROW_BYTES"]
    offset = count_from_bottom_of_file(
        filename, n_rows, row_bytes=bytes_per_row
    )
    return True, offset

sc_rdr_structure(block, name, filename, data, identifiers)

TEGA_SCRDR.FMT: most of the START_BYTEs are off by 4 because column 2 ("TEGA_TIME") is actually 8 bytes, not 4

HITS * phoenix * scrdr

Source code in pdr/formats/phoenix.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
def sc_rdr_structure(block, name, filename, data, identifiers):
    """
    Build the format definition for TEGA SC RDR tables with corrected column
    sizes and start bytes.

    TEGA_SCRDR.FMT: most of the START_BYTEs are off by 4 because column 2
    ("TEGA_TIME") is actually 8 bytes, not 4. Column 2's BYTES is set to 8
    and every column numbered 3 or higher has 4 added to its START_BYTE,
    before offsets and sample types are computed.

    HITS
    * phoenix
        * scrdr
    """
    from pdr.pd_utils import insert_sample_types_into_df, compute_offsets
    from pdr.loaders.queries import read_table_structure

    structure = read_table_structure(block, name, filename, data, identifiers)
    for row in range(len(structure)):
        if structure.at[row, "COLUMN_NUMBER"] == 2:
            structure.at[row, "BYTES"] = 8
        if structure.at[row, "COLUMN_NUMBER"] >= 3:
            shifted = structure.at[row, "START_BYTE"] + 4
            structure.at[row, "START_BYTE"] = shifted

    structure = compute_offsets(structure)
    return insert_sample_types_into_df(structure, identifiers)

wcl_edr_special_block(data, name)

WCL EDR ema/emb/emc tables: the START_BYTE for columns 13 and 14 are off by 1 and 2 bytes respectively. (The em8/em9/emf tables are fine.)

HITS * phoenix * wcl_edr (partial)

Source code in pdr/formats/phoenix.py
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def wcl_edr_special_block(data, name):
    """
    Return the label block for `name` with corrected column start bytes.

    WCL EDR ema/emb/emc tables: the START_BYTE for columns 13 and 14 are
    off by 1 and 2 bytes respectively. (The em8/em9/emf tables are fine.)

    The block is fetched with data.metablock_(name) and its "COLUMN" entries
    are modified in place: START_BYTE is reduced by 1 for COLUMN_NUMBER 13
    and by 2 for COLUMN_NUMBER 14. Only entries whose *key* is "COLUMN" are
    touched, so a non-column entry whose value happens to be the string
    "COLUMN" is left alone rather than being indexed as a column definition.

    HITS
    * phoenix
        * wcl_edr (partial)
    """
    block = data.metablock_(name)

    for key, column in block.items():
        if key != "COLUMN":
            continue
        if column["COLUMN_NUMBER"] == 13:
            column["START_BYTE"] -= 1
        if column["COLUMN_NUMBER"] == 14:
            column["START_BYTE"] -= 2
    return block

wcl_rdr_offset(data, name)

WCL RDR CP/CV tables: in the labels, each pointer's start byte is missing '<BYTES>' even though the units are bytes rather than file_records. This doesn't fix the header tables, though; they still need attention.

Source code in pdr/formats/phoenix.py
144
145
146
147
148
149
150
def wcl_rdr_offset(data, name):
    """
    WCL RDR CP/CV tables: in the labels, each pointer's start byte is
    missing '<BYTES>' even though the units are bytes rather than
    file_records. This doesn't fix the header table though, they still need
    attention.

    The start byte is taken from the last element of the "^" + name pointer
    value, minus one. Returns (True, start_byte).
    """
    pointer_value = data.metaget_("^" + name)
    return True, pointer_value[-1] - 1

formats.pvo

oims_12s_loader(data, name)

OIMS 12 second averages: all labels say 'ROWS = 42' regardless of the data's actual length

HITS * pvo * oims_12s

Source code in pdr/formats/pvo.py
19
20
21
22
23
24
25
26
27
28
29
30
def oims_12s_loader(data, name):
    """
    Return the label block for `name` with ROWS replaced by the label's
    FILE_RECORDS value.

    OIMS 12 second averages: all labels say 'ROWS = 42' regardless of the
    data's actual length.

    HITS
    * pvo
        * oims_12s
    """
    table_block = data.metablock_(name)
    record_count = data.metaget_("FILE_RECORDS")
    table_block["ROWS"] = record_count
    return table_block

orpa_low_res_loader(data, name)

ORPA low resolution: labels for earlier orbits have the correct ROW_BYTES, but there is a typo introduced later that says 'ROW_BYTES = 241' instead of 243

HITS * pvo * orpa_lowres

Source code in pdr/formats/pvo.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
def orpa_low_res_loader(data, name):
    """
    Return the label block for `name` with ROW_BYTES forced to 243.

    ORPA low resolution: labels for earlier orbits have the correct
    ROW_BYTES, but there is a typo introduced later that says 'ROW_BYTES =
    241' instead of 243.

    HITS
    * pvo
        * orpa_lowres
    """
    table_block = data.metablock_(name)
    table_block.update({"ROW_BYTES": 243})
    return table_block

formats.rosetta

fix_pad_length_structure(block, name, filename, data, identifiers)

The MIDAS FSC tables and several CONSERT ptypes have ROW_PREFIX_BYTES, ROW_SUFFIX_BYTES, and a COLUMN with multiple ITEMS. compute_offsets() calculates the wrong end_byte and pad_length values from the BYTES and ROW_BYTES values in their labels.

HITS * rosetta_consert * l2_land * l2_orbit * l3_land * l3_land_fss * l3_orbit * l3_orbit_fss * l4_land * l4_orbit * l4_orbit_grnd * rosetta_dust * RDR_midas_fsc

Source code in pdr/formats/rosetta.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def fix_pad_length_structure(block, name, filename, data, identifiers):
    """
    Build a format definition whose BYTES and ROW_BYTES values make
    compute_offsets() produce the right end_byte and pad_length.

    The MIDAS FSC tables and several CONSERT ptypes have ROW_PREFIX_BYTES,
    ROW_SUFFIX_BYTES, and a COLUMN with multiple ITEMS. compute_offsets()
    calculates the wrong end_byte and pad_length values from the BYTES and
    ROW_BYTES values in their labels.

    Here the block's ROW_PREFIX_BYTES / ROW_SUFFIX_BYTES / ROW_BYTES values
    (when present) are copied into the fmtdef, BYTES is replaced by
    ITEM_BYTES, and ROW_PREFIX_BYTES is added to ROW_BYTES.

    HITS
    * rosetta_consert
        * l2_land
        * l2_orbit
        * l3_land
        * l3_land_fss
        * l3_orbit
        * l3_orbit_fss
        * l4_land
        * l4_orbit
        * l4_orbit_grnd
    * rosetta_dust
        * RDR_midas_fsc
    """
    from pdr.pd_utils import insert_sample_types_into_df, compute_offsets
    from pdr.loaders.queries import read_table_structure

    structure = read_table_structure(block, name, filename, data, identifiers)
    for end in ("_PREFIX", "_SUFFIX", ""):
        key = f"ROW{end}_BYTES"
        row_length = block.get(key)
        if row_length is None:
            continue
        structure[key] = row_length

    # to calculate end_byte correctly in compute_offsets()
    structure["BYTES"] = structure["ITEM_BYTES"]
    # to calculate pad_length correctly in compute_offsets()
    structure["ROW_BYTES"] = (
        structure["ROW_BYTES"] + structure["ROW_PREFIX_BYTES"]
    )

    structure = compute_offsets(structure)
    return insert_sample_types_into_df(structure, identifiers)

midas_rdr_sps_structure(block, name, filename, data, identifiers)

SPS TIME_SERIES tables are made up of a repeated container with 4 columns followed by a non-repeated checksum column. In compute_offsets() the block_names list ends up out of order, so SB_OFFSET is not calculated correctly for columns in the repeated CONTAINER.

TODO: This seems like a more general issue with how compute_offsets() handles a repeated container followed by a single column

HITS * rosetta_dust * RDR_midas_sps

Source code in pdr/formats/rosetta.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def midas_rdr_sps_structure(block, name, filename, data, identifiers):
    """
    Build the format definition for SPS TIME_SERIES tables, which consist of
    a repeated container of 4 columns followed by a non-repeated checksum
    column. In compute_offsets() the `block_names` list ends up out of order
    for these tables, so SB_OFFSET is not calculated correctly for columns in
    the repeated CONTAINER; a placeholder row is prepended to work around
    that.

    TODO: This seems like a more general issue with how compute_offsets()
    handles a repeated container followed by a single column

    HITS
    * rosetta_dust
        * RDR_midas_sps
    """
    from pdr.pd_utils import insert_sample_types_into_df, compute_offsets
    from pdr.loaders.queries import read_table_structure
    import pandas as pd

    fmtdef = read_table_structure(block, name, filename, data, identifiers)

    # carry over whichever row-length keys the label block actually defines
    for row_key in ("ROW_PREFIX_BYTES", "ROW_SUFFIX_BYTES", "ROW_BYTES"):
        row_length = block.get(row_key)
        if row_length is not None:
            fmtdef[row_key] = row_length

    # A zero-width placeholder goes first so that the "block_names" list in
    # compute_offsets() comes out in the right order and SB_OFFSET is
    # calculated correctly.
    placeholder = pd.DataFrame(
        [
            {
                "NAME": "PLACEHOLDER_block",
                "DATA_TYPE": "VOID",
                "BYTES": 0,
                "START_BYTE": 1,
                "BLOCK_REPETITIONS": 1,
                # matches the checksum column's BLOCK_NAME
                "BLOCK_NAME": "CONTROL_DATA",
                "ROW_PREFIX_BYTES": 46,
            }
        ]
    )
    stacked = pd.concat([placeholder, fmtdef])
    fmtdef = stacked.reset_index(drop=True)

    return insert_sample_types_into_df(compute_offsets(fmtdef), identifiers)

rosetta_table_loader(filename, fmtdef_dt)

HITS * rosetta_rpc * RPCMIP

Source code in pdr/formats/rosetta.py
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
def rosetta_table_loader(filename, fmtdef_dt):
    """
    Read `filename` with astropy's ASCII reader, convert the result to a
    pandas DataFrame, and rename its columns to the NAME values of the format
    definition (the first element of `fmtdef_dt`).

    HITS
    * rosetta_rpc
        * RPCMIP
    """
    import astropy.io.ascii

    astropy_table = astropy.io.ascii.read(filename)
    frame = astropy_table.to_pandas()
    # only the format definition is needed; the second element is unused
    fmtdef, _ = fmtdef_dt
    frame.columns = fmtdef["NAME"].to_list()
    return frame

formats.saturn_rpx

rpx_img_hdu_start_byte(name, hdulist)

The multiple *_IMAGE pointers in these files all point at the same FITS HDU (each pointer illegally represents one band of the image).

HITS * saturn_rpx * hst_raw_img * hst_raw_mask * hst_cal_img * hst_cal_mask * hst_eng_data * hst_eng_mask

Source code in pdr/formats/saturn_rpx.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
def rpx_img_hdu_start_byte(name, hdulist):
    """
    Return the start byte for an RPX image pointer.

    The multiple *_IMAGE pointers in these files all point at the same FITS
    HDU (each pointer illegally represents one band of the image). Pointers
    whose name contains 'HEADER' start at byte 0; every other pointer starts
    at the 'datLoc' value reported by `hdulist.fileinfo(0)`.

    HITS
    * saturn_rpx
        * hst_raw_img
        * hst_raw_mask
        * hst_cal_img
        * hst_cal_mask
        * hst_eng_data
        * hst_eng_mask
    """
    return 0 if 'HEADER' in name else hdulist.fileinfo(0)['datLoc']

formats.themis

check_gzip_fn(data, object_name)

Some THEMIS QUBEs are stored in gzipped formats. The labels do not always bother to mention this.

HITS * themis * BTR * ABR * PBT_v1 * PBT_v2 * ALB_v2 * ir_GEO_v2 * vis_GEO_v2 * ir_EDR * vis_EDR * vis_RDR

Source code in pdr/formats/themis.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def check_gzip_fn(data, object_name):
    """
    Some THEMIS QUBEs are stored in gzipped formats. The labels do not always
    bother to mention this.

    Looks up the pointer target for `object_name` in the label metadata and
    returns a 2-tuple `(is_special, target)`:

    * `(False, None)` when the pointer target is a dict or an int, i.e.
      there is no filename to work with;
    * `(True, filename)` when the label's filename already ends in "gz";
    * `(True, [filename, f"{filename}.gz"])` otherwise, so that both the
      plain and the gzipped name are offered as candidates.

    HITS
    * themis
        * BTR
        * ABR
        * PBT_v1
        * PBT_v2
        * ALB_v2
        * ir_GEO_v2
        * vis_GEO_v2
        * ir_EDR
        * vis_EDR
        * vis_RDR
    """
    target = data.metaget(pointerize(object_name))
    if isinstance(target, (dict, int)):
        return False, None
    filename = listify(target)[0]
    if filename.endswith("gz"):
        # Previously this branch returned the bare filename, which breaks any
        # caller that unpacks the (is_special, target) pair the other
        # branches return.
        return True, filename
    return True, [filename, f"{filename}.gz"]

get_qube_offset(data)

some THEMIS QUBEs mis-specify file records.

HITS * themis * ir_GEO_v2 * vis_GEO_v2

Source code in pdr/formats/themis.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def get_qube_offset(data):
    """
    Some THEMIS QUBEs mis-specify file records.

    When the label's FILE_RECORDS value is at least as large as the size in
    bytes of the file mapped to "QUBE", return `(True, offset)`, where
    `offset` is the last element of the ^QUBE pointer minus 1. Otherwise
    return `(False, None)`.

    HITS
    * themis
        * ir_GEO_v2
        * vis_GEO_v2
    """
    declared_records = data.metaget_("FILE_RECORDS")
    file_size = os.stat(data.file_mapping["QUBE"]).st_size
    if declared_records < file_size:
        return False, None
    return True, data.metaget_("^QUBE")[-1] - 1

get_visgeo_qube_offset(data)

Source code in pdr/formats/themis.py
11
12
13
def get_visgeo_qube_offset(data):
    """
    Always report a special case: return `(True, offset)`, where `offset` is
    the element at index 1 of the label's ^QUBE pointer minus 1.
    """
    qube_pointer = data.metaget_("^QUBE")
    offset = qube_pointer[1] - 1
    return True, offset

trivial_themis_geo_loader(pointer)

HITS * themis * ir_GEO_v2 * vis_GEO_v2

Source code in pdr/formats/themis.py
16
17
18
19
20
21
22
23
24
def trivial_themis_geo_loader(pointer):
    """
    Stand-in loader: warn that THEMIS `pointer` objects are not currently
    supported, then return True.

    HITS
    * themis
        * ir_GEO_v2
        * vis_GEO_v2
    """
    message = f"THEMIS {pointer} objects are not currently supported."
    warnings.warn(message)
    return True

formats.ulysses

gas_table_loader(filename, fmtdef_dt)

GASDATA.FMT has the wrong START_BYTE for columns in the container. After manually changing the labels during testing, START_BYTE was still not incrementing correctly with each repetition of the container. This fixes both issues with 1 special case.

HITS * ulysses * gas

Source code in pdr/formats/ulysses.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
def gas_table_loader(filename, fmtdef_dt):
    """
    Load a Ulysses GAS table by whitespace splitting instead of by the byte
    positions in its label.

    GASDATA.FMT has the wrong START_BYTE for columns in the container.
    After manually changing the labels during testing, START_BYTE was still
    not incrementing correctly with each repetition of the container.
    This fixes both issues with 1 special case.

    The first 17 rows of the file are skipped, and the resulting columns are
    named after the NAME values of the format definition (the first element
    of `fmtdef_dt`). An AssertionError is raised if the column counts differ.

    HITS
    * ulysses
        * gas
    """
    import pandas as pd

    fmtdef, _ = fmtdef_dt
    column_names = fmtdef.NAME.tolist()
    # Some tables use tabs as column delimiters, others use spaces, so split
    # on any run of whitespace.
    frame = pd.read_csv(filename, skiprows=17, sep=r"\s+", header=None)
    assert len(frame.columns) == len(column_names)
    frame.columns = column_names
    return frame

get_sample_type(base_samp_info)

The bit column's data_type is BIT_STRING, which throws errors. Guessing this should be MSB_BIT_STRING. The tables look correct when compared to their ASCII versions.

HITS * ulysses * epac_pha_bin

Source code in pdr/formats/ulysses.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def get_sample_type(base_samp_info):
    """
    The bit column's data_type is BIT_STRING, which throws errors. Guessing
    this should be MSB_BIT_STRING. The tables look correct when compared to
    their ASCII versions.

    Returns `(True, sample_type)` with the MSB_BIT_STRING sample type when
    the SAMPLE_TYPE entry is exactly "BIT_STRING", and `(False, None)`
    otherwise.

    HITS
    * ulysses
        * epac_pha_bin
    """
    from pdr.datatypes import sample_types

    declared_type = base_samp_info["SAMPLE_TYPE"]
    byte_width = base_samp_info["BYTES_PER_PIXEL"]

    if declared_type != "BIT_STRING":
        return False, None
    corrected = sample_types(
        "MSB_BIT_STRING", int(byte_width), for_numpy=True
    )
    return True, corrected

get_special_block(data, name, identifiers)

START_BYTE is wrong for repeated columns within the container. ITEM_BYTES is also off by 1.

HITS * ulysses * epac_all_chan * epac_omni_ele * epac_omni_pro * epac_pha_asc * epac_pha_bin * epac_prtl * epac_pstl

Source code in pdr/formats/ulysses.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def get_special_block(data, name, identifiers):
    """
    START_BYTE is wrong for repeated columns within the container. ITEM_BYTES
    is also off by 1.

    Fetches the label block for `name`, patches it in place according to the
    product's DATA_SET_ID, and returns it. Blocks from other data sets are
    returned unmodified.

    HITS
    * ulysses
        * epac_all_chan
        * epac_omni_ele
        * epac_omni_pro
        * epac_pha_asc
        * epac_pha_bin
        * epac_prtl
        * epac_pstl
    """
    block = data.metablock_(name)
    dataset_id = identifiers["DATA_SET_ID"]
    if "ULY-J-EPAC-4-SUMM-PSTL" in dataset_id:
        column = block["CONTAINER"]["COLUMN"]
        column["ITEM_BYTES"] = 13
        column["START_BYTE"] = 1
    elif "ULY-J-EPAC-4-SUMM-ALL-CHAN" in dataset_id:
        containers = block.getall('CONTAINER')
        # first container: only its column needs resetting
        containers[0]['COLUMN']['START_BYTE'] = 1
        # second container: both the nested container and its column
        nested = containers[1]['CONTAINER']
        nested['START_BYTE'] = 1
        nested['COLUMN']['START_BYTE'] = 1
    return block

formats.vega

fix_array_structure(name, block, fn, data, identifiers)

HITS

  • giotto
    • pia
  • vega
    • puma_mode
Source code in pdr/formats/vega.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def fix_array_structure(name, block, fn, data, identifiers):
    """
    Special-case structure handling for ARRAY objects.

    Returns a 2-tuple:

    * `(None, None)` when the block's INTERCHANGE_FORMAT is not "BINARY";
    * `(None, dtype)` when the array has no subobject, with the dtype
      derived from the block's DATA_TYPE and BYTES;
    * otherwise whatever insert_sample_types_into_df() returns for the
      adjusted format definition.

    HITS

    * giotto
        * pia
    * vega
        * puma_mode
    """
    from pdr.datatypes import sample_types
    from pdr.loaders.queries import read_table_structure, \
        check_array_for_subobject

    if block.get("INTERCHANGE_FORMAT") != "BINARY":
        return None, None
    if not check_array_for_subobject(block):
        return None, sample_types(block["DATA_TYPE"], block["BYTES"], True)

    fmtdef = read_table_structure(block, name, fn, data, identifiers)
    collection_bytes = block.get("COLLECTION").get("BYTES")
    is_spectrum = fmtdef['NAME'] == 'PLACEHOLDER_SPECTRUM'
    spectrum_start = fmtdef.loc[is_spectrum, "START_BYTE"].iloc[0]
    spectrum_rows = fmtdef.loc[fmtdef['BLOCK_NAME'].str.contains('SPECTRUM')]
    # bytes from the spectrum's start to the end of the COLLECTION, split
    # evenly across the rows belonging to a SPECTRUM block
    fmtdef.loc[is_spectrum, "AXIS_ITEMS"] = (
        (collection_bytes - spectrum_start + 1) / len(spectrum_rows)
    )
    fmtdef.loc[is_spectrum, 'NAME'] = 'SPECTRUM'
    # this special flow leaves in all the placeholder columns, so cut them
    is_real_column = ~fmtdef['NAME'].str.startswith('PLACEHOLDER')
    fmtdef = fmtdef.loc[is_real_column].copy().reset_index(drop=True)
    # Sometimes arrays define start_byte, sometimes their elements do
    if "START_BYTE" in fmtdef.columns:
        fmtdef['START_BYTE'] = fmtdef['START_BYTE'].fillna(1)
    from pdr.pd_utils import compute_offsets, insert_sample_types_into_df
    return insert_sample_types_into_df(compute_offsets(fmtdef), identifiers)

get_structure(block, name, filename, data, identifiers)

"Encounter data" tables miscount the last column's START_BYTE by 1

HITS * vega * ducma

Source code in pdr/formats/vega.py
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
def get_structure(block, name, filename, data, identifiers):
    """
    "Encounter data" tables miscount the last column's START_BYTE by 1.

    Reads the table structure normally; when the block's DESCRIPTION contains
    "encounter data", START_BYTE at index 10 of the format definition is set
    to 62. Returns `(fmtdef, None)`.

    HITS
    * vega
        * ducma
    """
    from pdr.loaders.queries import read_table_structure

    fmtdef = read_table_structure(block, name, filename, data, identifiers)
    is_encounter_table = "encounter data" in block['DESCRIPTION']
    if is_encounter_table:
        fmtdef.at[10, "START_BYTE"] = 62
    return fmtdef, None

formats.viking

seis_table_loader(filepath, fmtdef_dt)

The Viking 2 seismometer tables have mangled labels. The raw data tables are variable length CSVs, and labels for the summary tables count column bytes wrong. Half the labels define columns that do not match the data.

HITS * viking * seis_raw * seis_summary

Source code in pdr/formats/viking.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def seis_table_loader(filepath, fmtdef_dt):
    """
    The Viking 2 seismometer tables have mangled labels. The raw data tables
    are variable length CSVs, and labels for the summary tables count column
    bytes wrong. Half the labels define columns that do not match the data.

    `filepath` may be a string or any path-like object. The table type is
    chosen from the file's own name only (case-insensitively), never from the
    names of its parent directories. `fmtdef_dt` is a pair whose first
    element is the format definition; its NAME values (minus placeholders)
    supply the column names.

    Returns the loaded pandas DataFrame. Raises ValueError if the filename
    matches none of the known table types, or if a column the label is
    expected to define is absent; raises AssertionError if the number of
    columns read differs from the number of column names.

    HITS
    * viking
        * seis_raw
        * seis_summary
    """
    from pathlib import Path

    import pandas as pd

    col_names = [c for c in fmtdef_dt[0].NAME if "PLACEHOLDER" not in c]
    # Path(...).name copes with Path objects and with the platform's own
    # separator, which a manual split on "/" does not.
    filename = Path(filepath).name.lower()
    # The summary tables have miscounted bytes in their labels. The columns are
    # separated by whitespace, so can be read by read_csv() instead. Also, both
    # labels define a SEISMIC_TIME_SOLS column that doesn't exist in the data.
    if "summary" in filename:
        table = pd.read_csv(filepath, header=None, sep=r"\s+")
        col_names.remove("SEISMIC_TIME_SOLS")
        if "event_wind_summary" in filename:
            # event_wind_summary.tab has a column not included in the label. It
            # is listed in: https://pds-geosciences.wustl.edu/viking/vl2-m-seis-5-rdr-v1/vl_9020/document/vpds_event_winds_format.txt
            col_names.insert(7, "ORIGINAL_LINES_COUNT")
    # The raw event tables are variable-length CSVs. Their labels include a
    # SEISMIC_SOL column that doesn't exist in the data.
    elif "event" in filename:
        table = pd.read_csv(filepath, header=None, sep=",")
        col_names.remove("SEISMIC_SOL")
    # The raw high-rate tables are variable-length CSVs. Their labels list the
    # correct number of columns.
    elif "high" in filename:
        table = pd.read_csv(filepath, header=None, sep=",")
    else:
        raise ValueError("Unknown Viking 2 SEIS table format.")
    assert len(table.columns) == len(col_names), "mismatched column count"
    table.columns = col_names
    return table

formats.voyager

get_fn(data)

Some of the PPS Jitter tables' SERIES pointers have the wrong filename.

HITS: * vg_ring_profiles * pps_jitter

Source code in pdr/formats/voyager.py
157
158
159
160
161
162
163
164
165
166
167
168
def get_fn(data):
    """
    Some of the PPS Jitter tables' SERIES pointers have the wrong filename.

    Ignores the pointer and instead returns `(True, path)`, where `path` sits
    next to the label file and shares its stem, with a ".TAB" extension.

    HITS:
    * vg_ring_profiles
        * pps_jitter
    """
    from pathlib import Path

    label_path = Path(data.labelname)
    table_path = label_path.parent / f"{label_path.stem}.TAB"
    return True, table_path

get_structure(block, name, filename, data, identifiers)

The VGR_PLS_HR_2017.FMT for PLS 1-hour averages undercounts the last column by 1 byte.

HITS * vg_pls * sys_1hr_avg (partial)

Source code in pdr/formats/voyager.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def get_structure(block, name, filename, data, identifiers):
    """
    The VGR_PLS_HR_2017.FMT for PLS 1-hour averages undercounts the last column
    by 1 byte.

    Reads the table structure normally, then sets BYTES at index 8 of the
    format definition to 6. Returns `(fmtdef, None)`.

    HITS
    * vg_pls
        * sys_1hr_avg (partial)
    """
    from pdr.loaders.queries import read_table_structure

    structure = read_table_structure(block, name, filename, data, identifiers)
    structure.at[8, "BYTES"] = 6
    return structure, None

lecp_table_loader(filename, fmtdef_dt)

VG1 LECP Jupiter SUMM Sector tables reference a format file with incorrect START_BYTEs for columns within a CONTAINER. Columns are consistently separated by whitespace. The VG2 Uranus 12.8 minute step table (ascii version) was missing values from some rows, not sure why. Reusing this special case fixes it.

HITS vg_lecp * j_summ_sector_vg1 * u_rdr_step_12.8 (partial)

Source code in pdr/formats/voyager.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def lecp_table_loader(filename, fmtdef_dt):
    """
    Load an LECP table by splitting on whitespace rather than by the byte
    positions in its label.

    VG1 LECP Jupiter SUMM Sector tables reference a format file with incorrect
    START_BYTEs for columns within a CONTAINER. Columns are consistently
    separated by whitespace.
    The VG2 Uranus 12.8 minute step table (ascii version) was missing values
    from some rows, not sure why. Reusing this special case fixes it.

    Columns are named after the non-placeholder NAME values of the format
    definition; an AssertionError is raised if the counts differ.

    HITS
    vg_lecp
        * j_summ_sector_vg1
        * u_rdr_step_12.8 (partial)
    """
    import pandas as pd

    fmtdef, _ = fmtdef_dt
    frame = pd.read_csv(filename, header=None, sep=r"\s+")

    wanted_names = []
    for column_name in fmtdef['NAME']:
        if "PLACEHOLDER" not in column_name:
            wanted_names.append(column_name)
    assert len(frame.columns) == len(wanted_names), "mismatched column count"
    frame.columns = wanted_names
    return frame

lecp_vg1_sat_table_loader(filename, fmtdef_dt)

VG1 Saturn RDR step products have an extra header row partway through their tables. This special case skips those rows by treating them as comments. PDS volume affected: VG1-S-LECP-3-RDR-STEP-6MIN-V1.0

HITS vg_lecp * s_rdr_step (partial)

Source code in pdr/formats/voyager.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def lecp_vg1_sat_table_loader(filename, fmtdef_dt):
    """
    VG1 Saturn RDR step products have an extra header row partway through
    their tables. This special case skips those rows by treating them as
    comments.
    PDS volume affected: VG1-S-LECP-3-RDR-STEP-6MIN-V1.0

    Columns are named after the non-placeholder NAME values of the format
    definition; an AssertionError is raised if the counts differ.

    HITS
    vg_lecp
        * s_rdr_step (partial)
    """
    import pandas as pd

    fmtdef, _ = fmtdef_dt
    # Rows that start with "VOYAGER" are extra headers; declaring 'V' the
    # comment character makes the parser drop them.
    frame = pd.read_csv(filename, comment='V')

    wanted_names = []
    for column_name in fmtdef['NAME']:
        if "PLACEHOLDER" not in column_name:
            wanted_names.append(column_name)
    assert len(frame.columns) == len(wanted_names), "mismatched column count"
    frame.columns = wanted_names
    return frame

mag_special_block(data, name)

ROW_BYTES are listed as 144 in the labels for Uranus and Neptune MAG RDRs. Their tables look the same, but the Neptune products open wrong. Setting ROW_BYTES to 145 fixes it.

HITS * vg_mag * rdr_nep

Source code in pdr/formats/voyager.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
def mag_special_block(data, name):
    """
    ROW_BYTES are listed as 144 in the labels for Uranus and Neptune MAG RDRs.
    Their tables look the same, but the Neptune products open wrong. Setting
    ROW_BYTES to 145 fixes it.

    Returns the label block for `name` with ROW_BYTES overwritten.

    HITS
    * vg_mag
        * rdr_nep
    """
    patched_block = data.metablock_(name)
    patched_block["ROW_BYTES"] = 145
    return patched_block

pls_avg_special_block(data, name)

Because VGR_PLS_HR_2017.FMT undercounts by 1 byte, the products that reference it also undercount their ROW_BYTES by 1.

HITS * vg_pls * sys_1hr_avg

Source code in pdr/formats/voyager.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def pls_avg_special_block(data, name):
    """
    Because VGR_PLS_HR_2017.FMT undercounts by 1 byte, the products that
    reference it also undercount their ROW_BYTES by 1.

    Returns `(True, block)` with ROW_BYTES set to 57 when the block's
    ^STRUCTURE is that format file, and `(False, None)` otherwise.

    HITS
    * vg_pls
        * sys_1hr_avg
    """
    block = data.metablock_(name)
    if block["^STRUCTURE"] != "VGR_PLS_HR_2017.FMT":
        return False, None
    block["ROW_BYTES"] = 57
    return True, block

pls_fine_special_block(data, name)

Most of the PLS FINE RES labels undercount the ROW_BYTES. The most recent product (2007-241_2018-309) is formatted differently and opens correctly.

HITS * vg_pls * sys_fine_res

Source code in pdr/formats/voyager.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def pls_fine_special_block(data, name):
    """
    Most of the PLS FINE RES labels undercount the ROW_BYTES. The most recent
    product (2007-241_2018-309) is formatted differently and opens correctly.

    Returns `(True, block)` with ROW_BYTES raised to 64 when the label gives
    57, and `(False, None)` otherwise.

    HITS
    * vg_pls
        * sys_fine_res
    """
    block = data.metablock_(name)
    if block["ROW_BYTES"] != 57:
        return False, None
    block["ROW_BYTES"] = 64
    return True, block

pls_ionbr_special_block(data, name)

SUMRY.LBL references the wrong format file

HITS * vg_pls * ur_ionbr (partial)

Source code in pdr/formats/voyager.py
68
69
70
71
72
73
74
75
76
77
78
def pls_ionbr_special_block(data, name):
    """
    SUMRY.LBL references the wrong format file.

    Returns `(True, block)`, where `block` is the label block for `name`
    with its ^STRUCTURE pointer replaced by "SUMRY.FMT".

    HITS
    * vg_pls
        * ur_ionbr (partial)
    """
    patched_block = data.metablock_(name)
    patched_block["^STRUCTURE"] = "SUMRY.FMT"
    return True, patched_block

pra_special_block(data, name, identifiers)

PRA Lowband RDRs: The Jupiter labels use the wrong START_BYTE for columns in containers. The Saturn/Uranus/Neptune labels define columns with multiple ITEMS, but ITEM_BYTES is missing and the BYTES value is wrong.

HITS * vg_pra * lowband_jup * lowband_other

Source code in pdr/formats/voyager.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def pra_special_block(data, name, identifiers):
    """
    PRA Lowband RDRs: The Jupiter labels use the wrong START_BYTE for columns
    in containers. The Saturn/Uranus/Neptune labels define columns with
    multiple ITEMS, but ITEM_BYTES is missing and the BYTES value is wrong.

    Patches the label block for `name` in place according to the product's
    DATA_SET_ID and always returns `(True, block)`.

    HITS
    * vg_pra
        * lowband_jup
        * lowband_other
    """
    block = data.metablock_(name)
    dataset_id = identifiers["DATA_SET_ID"]
    multi_item_datasets = (
        "VG2-S-PRA-3-RDR-LOWBAND-6SEC-V1.0",
        "VG2-N-PRA-3-RDR-LOWBAND-6SEC-V1.0",
        "VG2-U-PRA-3-RDR-LOWBAND-6SEC-V1.0"
    )
    if dataset_id in multi_item_datasets:
        for entry in block.items():
            if "COLUMN" not in entry:
                continue
            column = entry[1]
            if "SWEEP" in column["NAME"]:
                column.add("ITEM_BYTES", 4)  # The original BYTES value
                column["BYTES"] = 284  # ITEM_BYTES * ITEMS
    elif dataset_id == "VG2-J-PRA-3-RDR-LOWBAND-6SEC-V1.0":
        for entry in block["CONTAINER"].items():
            if "COLUMN" not in entry:
                continue
            column = entry[1]
            column_name = column["NAME"]
            if column_name == "STATUS_WORD":
                column["START_BYTE"] = 1
            elif column_name == "DATA_CHANNELS":
                column["START_BYTE"] = 5
    return True, block

func

call_kwargfiltered(func: Callable, *args, **kwargs) -> Any

call a function, filtering out any keyword arguments it doesn't actually accept. intended to help unify signatures to call functions in a dispatched or sequenced fashion. NOTE: will not fix attempts to pass positional-only arguments by name.

Source code in pdr/func.py
86
87
88
89
90
91
92
93
94
def call_kwargfiltered(func: Callable, *args, **kwargs) -> Any:
    """
    Call `func` with all positional arguments passed through untouched and
    with only those keyword arguments it actually accepts. Intended to help
    unify signatures so that functions can be called in a dispatched or
    sequenced fashion. NOTE: will not fix attempts to pass positional-only
    arguments by name.
    """
    # TODO: Maybe rewrite as decorator
    accepted_kwargs = filterkwargs(func, kwargs)
    return func(*args, **accepted_kwargs)

filterkwargs(func: Callable, kwargdict: Mapping[str, Any]) -> dict[str, Any]

return a copy of kwargdict, discarding all keys that are not argument names of func.

Source code in pdr/func.py
76
77
78
79
80
81
82
83
def filterkwargs(
    func: Callable, kwargdict: Mapping[str, Any]
) -> dict[str, Any]:
    """
    return a copy of kwargdict, discarding all keys that are not argument
    names of func.

    The signature of func is inspected once per call rather than once per
    key. An empty kwargdict yields an empty dict without inspecting func at
    all.
    """
    if not kwargdict:
        return {}
    # signature inspection is comparatively slow, so do it exactly once
    accepted = get_argnames(func)
    return {
        key: value for key, value in kwargdict.items() if key in accepted
    }

get_all_argnames(*funcs: Callable, nonoptional=False) -> set[str]

return all parameter names found in the signatures of funcs. if nonoptional is True, don't include parameters marked as optional according to the conventions of this module, meaning that any of the following are true:

1. string representation of their annotation begins with "Optional"
2. string representation of their annotation ends with "| None"
   or begins with "None |"
3. they are named _ or __
Source code in pdr/func.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def get_all_argnames(*funcs: Callable, nonoptional=False) -> set[str]:
    """
    return all parameter names found in the signatures of funcs. if nonoptional
    is True, don't include parameters marked as optional according to the
    conventions of this module, meaning that any of the following are true:

        1. string representation of their annotation begins with "Optional"
        2. string representation of their annotation ends with "| None"
           or begins with "None |"
        3. they are named _ or __

    returns an empty set when called with no functions.
    """
    # only the literal True switches to the non-optional filter, as before
    collect = get_non_optional_argnames if nonoptional is True else get_argnames
    # set().union(*...) tolerates an empty `funcs`, unlike reduce() without
    # an initial value
    return set().union(*map(collect, funcs))

get_argnames(func: Callable) -> set[str]

return names of all parameters of a function

Source code in pdr/func.py
15
16
17
def get_argnames(func: Callable) -> set[str]:
    """return the set of parameter names in the signature of func"""
    parameters = signature(func).parameters
    return set(parameters)

get_non_optional_argnames(func: Callable) -> set[str]

determine names of arguments a function must receive by filtering out arguments explicitly annotated as Optional or named "_" or "__". Note that "nonoptional" here describes a convention of this module, not a Python typing requirement.

Source code in pdr/func.py
48
49
50
51
52
53
54
55
56
def get_non_optional_argnames(func: Callable) -> set[str]:
    """
    return the names of the parameters of func that not_optional() accepts,
    i.e. the arguments a function must receive once those explicitly
    annotated as Optional or named "_" or "__" are filtered out. Note that
    "nonoptional" here describes a _convention of this module_, not a Python
    typing requirement.
    """
    return {
        name
        for name, param in signature(func).parameters.items()
        if not_optional(param)
    }

not_optional(param: Parameter) -> bool

is this Parameter flagged as not required according to the conventions of this module?

Source code in pdr/func.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def not_optional(param: Parameter) -> bool:
    """
    return False if this Parameter is flagged as optional according to the
    conventions of this module, and True otherwise. A parameter counts as
    optional when the relevant piece of its string representation starts
    with "Optional[", ends with " | None", or starts with "None | ", or when
    it is named "_" or "__".
    """
    # TODO, maybe: this is even sketchier than it was before, but
    #  is the most expedient way to make it compatible with 3.14 and < 3.14.
    #  There's probably some hellish regex that makes it generally safe, but
    #  I'm not going to consider all the edge cases, because (1) there are so
    #  many ways to write type annotations, (2) their string representations
    #  aren't WYSIWYG, and (3) this is really only intended for
    #  library-internal use and doesn't need to support the entire type
    #  annotation system.

    pieces = re.split("[:=]", str(param))
    # with no ':' or '=' present there is only the name itself to inspect;
    # otherwise inspect whatever follows the first separator
    candidate = pieces[1] if len(pieces) > 1 else pieces[0]
    candidate = candidate.strip(" '")
    looks_optional = (
        candidate.startswith(("Optional[", "None | "))
        or candidate.endswith(" | None")
    )
    return not (looks_optional or param.name in ("_", "__"))

paramsort(params: Collection[Parameter]) -> list[Parameter]

sorts signature parameters into legal order

Source code in pdr/func.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def paramsort(params: Collection[Parameter]) -> list[Parameter]:
    """
    sort signature parameters into legal order: grouped by parameter kind,
    and within each kind with the parameters that not_optional() rejects
    placed last.
    """
    kind_order = (
        'POSITIONAL_ONLY',
        'POSITIONAL_OR_KEYWORD',
        'VAR_POSITIONAL',
        'KEYWORD_ONLY',
        'VAR_KEYWORD',
    )
    grouped = {kind: [] for kind in kind_order}
    for param in params:
        grouped[param.kind.name].append(param)

    ordered = []
    for kind in kind_order:
        # sorted() is stable, so this only moves Optional parameters to the
        # end of their group and otherwise keeps the incoming order
        ordered.extend(
            sorted(grouped[kind], key=lambda p: not_optional(p) is False)
        )
    return ordered

sig_union(*funcs: Callable) -> Signature

examine multiple functions and produce a Signature object describing the union of the parameters of all functions -- i.e., the expected signature of a function that routes all its arguments to the appropriate elements of funcs and calls them in a dispatched, sequenced, or parallel fashion (rather than composed).

Source code in pdr/func.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def sig_union(*funcs: Callable) -> Signature:
    """
    examine multiple functions and produce a Signature object describing the
    union of the parameters of all functions -- i.e., the expected signature
    of a function that routes all its arguments to the appropriate elements
    of funcs and calls them in a dispatched, sequenced, or parallel fashion
    (rather than composed).

    raises TypeError when two same-named parameters both carry annotations
    and those annotations differ.
    """
    params = reduce(set.union, map(sigparams, funcs))
    survivors = set(params)
    for first, second in combinations(params, r=2):
        # filter duplicate parameter names caused by mismatched type
        # annotations (other causes of mismatches indicate real problems)
        if first.name != second.name:
            continue
        first_bare = first._annotation == _empty
        second_bare = second._annotation == _empty
        if first_bare and not second_bare:
            # keep the annotated twin
            redundant = first
        elif first_bare or second_bare:
            redundant = second
        elif first._annotation != second._annotation:
            raise TypeError(
                f"{first.name} and {second.name} have different annotations in "
                f"some of {funcs}, suggesting possible type mismatch."
            )
        else:
            redundant = second
        # the duplicate may already have been dropped by an earlier pair
        survivors.discard(redundant)
    return Signature(paramsort(survivors))

sigparams(func: Callable) -> set[Parameter]

examine a function and extract a set of inspect.Parameter objects from its signature

Source code in pdr/func.py
 97
 98
 99
100
101
102
def sigparams(func: Callable) -> set[Parameter]:
    """
    return the inspect.Parameter objects from the signature of func,
    collected into a set
    """
    parameters = signature(func).parameters
    return {*parameters.values()}

softquery(func: Callable, querydict: Mapping[str, Callable], kwargdict: dict[str, Any]) -> dict[str, Any]

implements a pipeline that accumulates 'information' -- more literally a dictionary of named parameters (kwargdict). querydict describes the sequence of functions to call and the parameter names they will populate in kwargdict. a function in querydict may use information gathered by preceding functions or passed explicitly to softquery in kwargdict, so long as the keys of kwargdict / querydict correspond to the parameter names of that function.

Source code in pdr/func.py
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
def softquery(
    func: Callable,
    querydict: Mapping[str, Callable],
    kwargdict: dict[str, Any],
) -> dict[str, Any]:
    """
    run a pipeline that accumulates 'information' -- more literally a
    dictionary of named parameters (kwargdict). querydict describes the
    sequence of functions to call and the parameter names they will populate
    in kwargdict. a function in querydict may use information gathered by
    preceding functions or passed explicitly to softquery in kwargdict,
    so long as the keys of kwargdict / querydict correspond to the parameter
    names of that function.

    kwargdict is updated in place and also returned. raises TypeError if a
    required argument is neither present in kwargdict nor produced by any
    entry of querydict.
    """
    # explanatory variables
    have_args = kwargdict.keys()
    require_args = get_all_argnames(
        func, *querydict.values(), nonoptional=True
    )
    args_to_get = require_args.difference(have_args)
    missing = args_to_get.difference(querydict)
    if missing:
        raise TypeError(f"Missing args in querydict: {missing}")
    # neither operand changes while the queries run, so decide up front
    # which of them actually need to be executed
    pending = args_to_get.intersection(querydict)
    for qname, query in querydict.items():
        if qname not in pending:
            continue
        if "tracker" in kwargdict:
            kwargdict["tracker"].track(query)
        kwargdict[qname] = call_kwargfiltered(query, **kwargdict)
    return kwargdict

specialize(func: Callable, check: Callable[..., tuple[bool, Any]], error: Optional[Callable[[Exception], str]] = None, tracker: TrivialTracker = TrivialTracker()) -> Callable

function decorator that permits dispatch of calls to func to an arbitrary set of special-case functions defined in check. replaces the pre-1.0 pdr special case checks.

Source code in pdr/func.py
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
def specialize(
    func: Callable,
    check: Callable[..., tuple[bool, Any]],
    error: Optional[Callable[[Exception], str]] = None,
    tracker: TrivialTracker = TrivialTracker(),
) -> Callable:
    """
    Wrap `func` so that calls to it can be preempted by the special-case
    dispatcher `check`. Replaces the pre-1.0 pdr special case checks.

    On each call, `check` is invoked first (through `call_kwargfiltered`)
    and must return a pair `(is_special, special_result)`. The outcome is
    reported to `tracker.track()`. If `is_special` is exactly True,
    `special_result` is returned and `func` is never called; otherwise the
    result of calling `func` (again through `call_kwargfiltered`) is
    returned.

    If either call raises an Exception and `error` was given, the return
    value of `error(exception)` is returned instead; with no `error` the
    exception propagates. The wrapper's `__signature__` is set to
    `sig_union(func, check)`.
    """

    @wraps(func)
    def preempt_if_special(*args, **kwargs):
        try:
            # TODO: if we want to catch the _name_ of the special case at this
            #  level, we need to change the default signature of special case
            #  checks to return the name of the special case, or do more
            #  digging into deeper levels than I like
            is_special, special_result = call_kwargfiltered(
                check, *args, **kwargs
            )
            tracker.track(check, is_special=is_special)
            if is_special is not True:
                # ordinary case: defer to the wrapped function
                return call_kwargfiltered(func, *args, **kwargs)
            return special_result
        except Exception as ex:
            if error is not None:
                return error(ex)
            raise

    preempt_if_special.__signature__ = sig_union(func, check)
    return preempt_if_special

loaders

loaders._helpers

Simple utility functions for assorted loaders and queries.

HETERODOX_ENDING = re.compile('\\r\\n?') module-attribute

Pattern for heterodox but not deeply bizarre line endings.

_cle = curry(re.sub, HETERODOX_ENDING, '\n') module-attribute

partially evaluated replacer of heterodox with orthodox line endings.

_check_delimiter_stream(identifiers: DataIdentifiers, name: str, target: PhysicalTarget, block: MultiDict) -> bool

Does it look like this object is a delimiter-separated table without an explicitly-defined row length?

Source code in pdr/loaders/_helpers.py
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def _check_delimiter_stream(
    identifiers: DataIdentifiers,
    name: str,
    target: PhysicalTarget,
    block: MultiDict,
) -> bool:
    """
    Heuristic: does this object look like a delimiter-separated table with
    no explicitly-defined row length?

    Returns False as soon as any of the following holds: the pointer target
    (or the last element of a list/tuple target) is a dict whose "units" are
    "BYTES"; `block` contains a "BYTES" key; `identifiers["RECORD_BYTES"]` is
    anything other than None or ""; or `identifiers["RECORD_TYPE"]` is not
    "STREAM". Otherwise returns True only if `name` contains "ASCII",
    "SPREADSHEET", or "HEADER".
    """
    # TODO: this may be deprecated. assess against notionally-supported
    #  products.
    # TODO: untangle this, everywhere
    # the quantity dict may be the target itself or the tail of a sequence
    quantity = target[-1] if isinstance(target, (list, tuple)) else target
    if isinstance(quantity, dict) and quantity.get("units") == "BYTES":
        return False
    # TODO: Other criteria that could appear in the block?
    if "BYTES" in block:
        return False
    # TODO: not sure this is a good assumption -- it is a bad assumption
    #  for the CHEMIN RDRs, but those labels are just wrong
    if identifiers["RECORD_BYTES"] not in (None, ""):
        return False
    # TODO: not sure this is a good assumption
    if identifiers["RECORD_TYPE"] != "STREAM":
        return False
    # Well-known object types that imply textuality, if we have nothing
    # else to go on
    textual_markers = ("ASCII", "SPREADSHEET", "HEADER")
    return any(marker in name for marker in textual_markers)

canonicalize_line_endings(text: Any) -> Any

Attempt to replace common 'heterodox' line endings in a string or list/tuple of strings with canonical endings (`\n`). Does not attempt to perform sophisticated delimiter sniffing, and will only reliably handle `\r` and `\r\n` endings, not `\n\r`, EM / 0x19, `\r\r\n`, etc. Ignores (returns unchanged) non-strings and non-string elements of lists/tuples.

Source code in pdr/loaders/_helpers.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def canonicalize_line_endings(text: Any) -> Any:
    """
    Replace common 'heterodox' line endings with the canonical ending (\\n).

    A string is passed through `_cle`. For a list or tuple, every top-level
    string element is passed through `_cle`, other elements are kept as they
    are, and the result is rebuilt with the same container type. Any other
    object is returned unchanged.

    No sophisticated delimiter sniffing is attempted: only \\r and \\r\\n
    endings are handled reliably, not \\n\\r, EM / 0x19, \\r\\r\\n, etc.
    """
    if isinstance(text, str):
        return _cle(text)
    if not isinstance(text, (list, tuple)):
        return text
    cleaned = []
    for element in text:
        cleaned.append(_cle(element) if isinstance(element, str) else element)
    return type(text)(cleaned)

canonicalized(func: Callable) -> Callable

Creates a version of func that canonicalizes line endings of any string (or top-level string elements of a list/tuple), returned by func

Source code in pdr/loaders/_helpers.py
138
139
140
141
142
143
144
145
146
147
148
def canonicalized(func: Callable) -> Callable:
    """
    Decorator: return a version of `func` whose return value is passed
    through `canonicalize_line_endings`, so that any returned string (or
    top-level string element of a returned list/tuple) has canonical line
    endings.
    """

    @wraps(func)
    def with_canonical_endings(*args, **kwargs):
        raw_result = func(*args, **kwargs)
        return canonicalize_line_endings(raw_result)

    return with_canonical_endings

check_explicit_delimiter(block: MultiDict) -> str

Check if an ASCII TABLE/SPREADSHEET definition explicitly gives a field delimiter. If it doesn't, tentatively assume it's comma-separated.

Source code in pdr/loaders/_helpers.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def check_explicit_delimiter(block: MultiDict) -> str:
    """
    Return the field delimiter character for an ASCII TABLE/SPREADSHEET
    definition.

    If `block` has a "FIELD_DELIMITER" key, its value must be one of
    "COMMA", "VERTICAL_BAR", "SEMICOLON", or "TAB", and the corresponding
    character is returned; any other value raises KeyError. If the key is
    absent, tentatively assume the table is comma-separated.
    """
    if "FIELD_DELIMITER" not in block.keys():
        return ","
    delimiter_characters = {
        "COMMA": ",",
        "VERTICAL_BAR": "|",
        "SEMICOLON": ";",
        "TAB": "\t",
    }
    try:
        return delimiter_characters[block["FIELD_DELIMITER"]]
    except KeyError:
        raise KeyError("Unknown FIELD_DELIMITER character.")

count_from_bottom_of_file(fn: Union[str, list, Path], rows: int, row_bytes: int) -> int

Fallback start-byte-finding function for cases in which a label gives the length of a table in terms of number of rows and row length, but does not specify where in the file the table starts. In these cases, the table usually goes to the end of the file, but may be preceded by a header or whatever, which means that we can often guess its start byte by subtracting the table size in bytes from the physical size of the file. This is not guaranteed to work!

Source code in pdr/loaders/_helpers.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def count_from_bottom_of_file(
    fn: Union[str, list, Path], rows: int, row_bytes: int
) -> int:
    """
    Fallback start-byte finder for labels that give a table's length (number
    of rows and row length) but not where in the file the table _starts_.

    Such tables usually run to the end of the file, possibly preceded by a
    header or whatever, so the start byte can often be guessed as the
    physical size of the file minus the table size (`rows * row_bytes`).
    If `fn` is a list, its first element is used as the file. This is not
    guaranteed to work!
    """
    path = Path(fn[0] if isinstance(fn, list) else fn)
    file_size = os.path.getsize(path)
    return file_size - rows * row_bytes

looks_like_ascii(block: MultiDict, name: str) -> bool

Is this probably an ASCII table?

Source code in pdr/loaders/_helpers.py
22
23
24
25
26
27
28
def looks_like_ascii(block: MultiDict, name: str) -> bool:
    """
    Is this probably an ASCII table? True if the object name contains
    "SPREADSHEET" or "ASCII", or if the block's INTERCHANGE_FORMAT is "ASCII".
    """
    if "SPREADSHEET" in name or "ASCII" in name:
        return True
    return block.get("INTERCHANGE_FORMAT") == "ASCII"

quantity_start_byte(quantity_dict: dict[str, Union[str, int]], record_bytes: Optional[int]) -> Optional[int]

Attempt to infer an object's start byte from a dict parsed from a PVL quantity object associated with a PVL pointer parameter, along with, if known, the size of a product's records (relevant only if the quantity units are not bytes). Returns None if we can't infer it (usually meaning that the label gives the start position in records but doesn't say how big the records are).

Source code in pdr/loaders/_helpers.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def quantity_start_byte(
    quantity_dict: dict[str, Union[str, int]], record_bytes: Optional[int]
) -> Optional[int]:
    """
    Attempt to infer an object's start byte from a dict parsed from a PVL
    quantity object associated with a PVL pointer parameter, along with, if
    known, the size of a product's records (relevant only if the quantity
    units are not bytes).

    `quantity_dict` must have "units" and "value" keys. The value is treated
    as 1-indexed: a value of 1 maps to start byte 0. Values below 1 are
    clamped so that the result is never negative, whichever unit is used.

    Returns the 0-indexed start byte, or None if we can't infer it (usually
    meaning that the label gives the start position in records but doesn't
    say how big the records are).
    """
    # TODO: are there cases in which _these_ aren't 1-indexed?
    # a negative offset is never a valid position, so clamp at the start of
    # the file for both units
    zero_indexed = max(quantity_dict["value"] - 1, 0)
    if quantity_dict["units"] == "BYTES":
        return zero_indexed
    if record_bytes is not None:
        return record_bytes * zero_indexed
    return None

loaders.astrowrap

loaders.datawrap

Classes to wrap and manage complex data-loading workflows.

Loader

compact wrapper for loader functions, intended principally but not solely for library-internal use. provides a common interface, adds compactness, delays imports, etc.

Source code in pdr/loaders/datawrap.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
class Loader:
    """
    compact wrapper for loader functions, intended principally but not solely
    for library-internal use. provides a common interface, adds compactness,
    delays imports, etc.

    Calling a Loader runs two stages: a query stage, in which `softquery`
    runs the functions in `self.queries` to assemble the arguments for the
    loader function, and a load stage, in which the loader function is
    called with those arguments. The outcome of each stage is reported to
    the tracker passed in the call's keyword arguments.
    """

    # query pipeline handed to softquery(); subclasses replace or extend it
    queries = DEFAULT_DATA_QUERIES

    def __init__(self, loader_function: LoaderFunction):
        self.loader_function = loader_function
        self.argnames = get_argnames(loader_function)

    def __call__(
        self, pdrlike: PDRLike, name: str, **kwargs
    ) -> dict[str, Any]:
        """
        Load the object `name` of `pdrlike`.

        `kwargs` must include a "tracker" (a KeyError is raised otherwise);
        all of `kwargs` is made available to the queries and the loader
        function, alongside "data" (`pdrlike`) and "name" (the depointerized
        `name`). Returns `{name: <loader function result>}`, keyed by `name`
        exactly as passed. Exceptions raised in either stage are recorded
        with the tracker and then propagate unchanged.
        """
        kwargdict = {"data": pdrlike, "name": depointerize(name)} | kwargs
        tracker = kwargdict["tracker"]
        tracker.set_metadata(loader=self.__class__.__name__)
        record_exc = {"status": "query_ok"}
        try:
            info = softquery(self.loader_function, self.queries, kwargdict)
        except Exception as exc:
            record_exc = {"status": "query_failed"} | _format_exc_report(exc)
            # bare raise keeps the original traceback intact
            raise
        finally:
            # runs on success and failure alike, so the tracker always sees
            # how the query stage ended
            tracker.track(self.loader_function, **record_exc)
            tracker.dump()
        load_exc = {"status": "load_ok"}
        try:
            return {name: call_kwargfiltered(self.loader_function, **info)}
        except Exception as exc:
            load_exc = {"status": "load_failed"} | _format_exc_report(exc)
            raise
        finally:
            tracker.track(self.loader_function, **load_exc)
            tracker.dump()

ReadArray

Bases: Loader

wrapper for read_array

Source code in pdr/loaders/datawrap.py
187
188
189
190
191
192
193
194
195
196
197
198
199
class ReadArray(Loader):
    """
    wrapper for read_array: a Loader whose loader function is
    `pdr.loaders.table.read_array`, and whose queries are the default data
    queries plus a "fmtdef_dt" query.
    """

    def __init__(self):
        # imported here rather than at module level, so the import only
        # happens when a ReadArray is actually instantiated
        from pdr.loaders.table import read_array
        from pdr.loaders.queries import parse_array_structure

        super().__init__(read_array)
        # "fmtdef_dt" is computed by parse_array_structure unless
        # check_special_structure reports a special case
        self.queries = DEFAULT_DATA_QUERIES | {
            "fmtdef_dt": specialize(
                parse_array_structure, check_special_structure
            ),
        }

ReadCompressedImage

Bases: Loader

wrapper for handle_compressed_image

Source code in pdr/loaders/datawrap.py
178
179
180
181
182
183
184
class ReadCompressedImage(Loader):
    """
    wrapper for handle_compressed_image: a Loader whose loader function is
    `pdr.loaders.handlers.handle_compressed_image`. Does not override the
    `queries` it inherits from Loader.
    """

    def __init__(self):
        # imported here rather than at module level, so the import only
        # happens when a ReadCompressedImage is actually instantiated
        from pdr.loaders.handlers import handle_compressed_image

        super().__init__(handle_compressed_image)

ReadFits

Bases: Loader

wrapper for handle_fits_file

Source code in pdr/loaders/datawrap.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
class ReadFits(Loader):
    """
    wrapper for handle_fits_file. Extends the default data queries with the
    FITS-specific ones below, and unwraps the outer dict that
    `Loader.__call__` builds around the loader function's result.
    """

    queries = DEFAULT_DATA_QUERIES | {
        "fn": get_file_mapping,
        "target": get_target,
        "identifiers": get_identifiers,
        "hdulist": get_hdulist,
        "hdu_id": specialize(
            get_fits_start_byte, check_special_fits_start_byte
        ),
        "hdu_id_is_index": constant(False),
    }

    def __init__(self):
        from pdr.loaders.handlers import handle_fits_file

        # noinspection PyTypeChecker
        super().__init__(handle_fits_file)

    def __call__(self, pdrlike: PDRLike, name: str, **kwargs):
        # Loader.__call__ returns {name: result}; we want the result itself.
        # slightly hacky but works with how we've done dictionary construction
        wrapped = super().__call__(pdrlike, name, **kwargs)
        return list(wrapped.values())[0]

ReadHeader

Bases: Loader

wrapper for read_header

Source code in pdr/loaders/datawrap.py
122
123
124
125
126
127
128
129
130
131
132
class ReadHeader(Loader):
    """
    wrapper for read_header: a Loader whose loader function is
    `pdr.loaders.text.read_header`, and whose queries are the default data
    queries plus a "table_props" query.
    """

    def __init__(self):
        # imported here rather than at module level, so the import only
        # happens when a ReadHeader is actually instantiated
        from pdr.loaders.text import read_header
        from pdr.loaders.queries import table_position

        super().__init__(read_header)
        # "table_props" is computed by table_position unless
        # check_special_position reports a special case
        self.queries = DEFAULT_DATA_QUERIES | {
            "table_props": specialize(table_position, check_special_position)
        }

ReadImage

Bases: Loader

wrapper for read_image

Source code in pdr/loaders/datawrap.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
class ReadImage(Loader):
    """
    wrapper for read_image: a Loader whose loader function is
    `pdr.loaders.image.read_image`, specialized by
    `check_special_compressed_file_reader`, and whose queries are the default
    data queries plus the image-specific queries defined in `__init__`.
    """

    def __init__(self):
        # imported here rather than at module level, so the imports only
        # happen when a ReadImage is actually instantiated
        from pdr.formats import check_special_compressed_file_reader
        from pdr.loaders.image import read_image
        from pdr.loaders.queries import (
            base_sample_info,
            im_sample_type,
            check_if_qube,
            get_qube_band_storage_type,
            generic_image_properties,
        )
        # the special-case check can preempt read_image entirely
        super().__init__(
            specialize(read_image, check_special_compressed_file_reader)
        )
        # each specialize(f, check) entry is computed by f unless check
        # reports a special case and supplies the value itself
        self.queries = DEFAULT_DATA_QUERIES | {
            "base_samp_info": base_sample_info,
            "sample_type": specialize(
                im_sample_type, check_special_sample_type
            ),
            "band_storage_type": specialize(
                get_qube_band_storage_type, check_special_qube_band_storage
            ),
            "gen_props": specialize(generic_image_properties, check_if_qube),
            # just modifies gen_props in place, triggers transform in load step
        }

ReadLabel

Bases: Loader

wrapper for read_label

Source code in pdr/loaders/datawrap.py
144
145
146
147
148
149
class ReadLabel(Loader):
    """
    wrapper for read_label: a Loader whose loader function is
    `pdr.loaders.text.read_label`, specialized by `check_special_label`.
    Does not override the `queries` it inherits from Loader.
    """

    def __init__(self):
        # imported here rather than at module level, so the import only
        # happens when a ReadLabel is actually instantiated
        from pdr.loaders.text import read_label
        super().__init__(specialize(read_label, check_special_label))

ReadTable

Bases: Loader

wrapper for read_table

Source code in pdr/loaders/datawrap.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
class ReadTable(Loader):
    """
    wrapper for read_table: a Loader whose loader function is
    `pdr.loaders.table.read_table`, specialized by
    `check_special_table_reader`, and whose queries are the default data
    queries plus "table_props" and "fmtdef_dt" queries.
    """

    def __init__(self):
        # imported here rather than at module level, so the imports only
        # happen when a ReadTable is actually instantiated
        from pdr.loaders.queries import table_position, parse_table_structure
        from pdr.loaders.table import read_table

        super().__init__(specialize(read_table, check_special_table_reader))
        # each specialize(f, check) entry is computed by f unless check
        # reports a special case and supplies the value itself
        self.queries = DEFAULT_DATA_QUERIES | {
            "table_props": specialize(table_position, check_special_position),
            "fmtdef_dt": specialize(
                parse_table_structure, check_special_structure
            ),
        }

ReadText

Bases: Loader

wrapper for read_text

Source code in pdr/loaders/datawrap.py
135
136
137
138
139
140
141
class ReadText(Loader):
    """
    wrapper for read_text: a Loader whose loader function is
    `pdr.loaders.text.read_text`. Does not override the `queries` it
    inherits from Loader.
    """

    def __init__(self):
        # imported here rather than at module level, so the import only
        # happens when a ReadText is actually instantiated
        from pdr.loaders.text import read_text

        super().__init__(read_text)

TBD

Bases: Loader

wrapper for tbd

Source code in pdr/loaders/datawrap.py
202
203
204
205
206
207
208
class TBD(Loader):
    """
    wrapper for tbd: a Loader whose loader function is
    `pdr.loaders.utility.tbd`. Does not override the `queries` it inherits
    from Loader.
    """

    def __init__(self):
        # imported here rather than at module level, so the import only
        # happens when a TBD is actually instantiated
        from pdr.loaders.utility import tbd

        super().__init__(tbd)

Trivial

Bases: Loader

wrapper for trivial

Source code in pdr/loaders/datawrap.py
211
212
213
214
215
216
217
class Trivial(Loader):
    """
    wrapper for trivial: a Loader whose loader function is
    `pdr.loaders.utility.trivial`. Does not override the `queries` it
    inherits from Loader.
    """

    def __init__(self):
        # imported here rather than at module level, so the import only
        # happens when a Trivial is actually instantiated
        from pdr.loaders.utility import trivial

        super().__init__(trivial)

_format_exc_report(exc: Exception) -> dict

format an exception report for inclusion in another dict

Source code in pdr/loaders/datawrap.py
30
31
32
33
34
35
36
37
def _format_exc_report(exc: Exception) -> dict:
    """
    Format an exception report for inclusion in another dict.

    Takes the dict produced by `exc_report(exc)` and prefixes every key
    other than 'exception' with "exception_", so that the entries are
    recognizable once merged into a larger dict. The same dict object is
    modified in place and returned.
    """
    report = exc_report(exc)
    # snapshot the entries to rename first, since the dict is edited below
    to_rename = [(k, v) for k, v in report.items() if k != 'exception']
    for old_key, value in to_rename:
        del report[old_key]
        report[f"exception_{old_key}"] = value
    return report

loaders.dispatch

Functions to select appropriate Loader subclasses for data objects.

OBJECTS_TO_IGNORE = ('DATA_SET_MAP_PROJECT.*', '.*_DESC$', '.*DESCRIPTION(_[0-9]*)?$') module-attribute

PDS3 objects we do not automatically load, even when loading greedily. These are reference files, usually throwaway ones, that are usually not archived in the same place as the data products and add little, if any, context to individual products (they are the same across an entire 'product type'). This means that in almost all cases, attempting to greedily load them has no purpose but to throw irrelevant warnings at the user.

file_extension_to_loader(fn: str) -> Loader

Attempt to select the correct Loader subclass for an object based solely on its file extension. Used primarily for objects only specified by a PDS3 FILE_NAME pointer or similar.

Source code in pdr/loaders/dispatch.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def file_extension_to_loader(fn: str) -> Loader:
    """
    Attempt to select the correct Loader subclass for an object based solely on
    its file extension. Used primarily for objects only specified by a PDS3
    FILE_NAME pointer or similar.

    The extension groups are tried in a fixed order (FITS, image, text,
    table, desktop image) and the first match wins; if none matches, a TBD
    loader is returned.
    """
    # order matters: earlier entries take precedence
    dispatch = (
        (FITS_EXTENSIONS, ReadFits),
        (IMAGE_EXTENSIONS, ReadImage),
        (TEXT_EXTENSIONS, ReadText),
        (TABLE_EXTENSIONS, ReadTable),
        (DESKTOP_IMAGE_EXTENSIONS, ReadCompressedImage),
    )
    for extensions, loader_class in dispatch:
        if looks_like_this_kind_of_file(fn, extensions):
            return loader_class()
    return TBD()

image_lib_dispatch(pointer: str, data: Data) -> Optional[Loader]

check file extensions to see if we want to toss a file to an external library rather than using our internal raster handling. current cases are: pillow for tiff, gif, or jp2; astropy for fits

Source code in pdr/loaders/dispatch.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def image_lib_dispatch(pointer: str, data: Data) -> Optional[Loader]:
    """
    Check the file extension of the pointer's target to see if we want to
    toss the file to an external library rather than using our internal
    raster handling. Current cases are: pillow for tiff, gif, or jp2; astropy
    for fits.

    Returns a ReadFits or ReadCompressedImage loader, or None if neither
    case applies.
    """
    target_fn = data._target_path(pointer)
    if looks_like_this_kind_of_file(target_fn, FITS_EXTENSIONS):
        return ReadFits()
    is_desktop_image = looks_like_this_kind_of_file(
        target_fn, DESKTOP_IMAGE_EXTENSIONS
    )
    return ReadCompressedImage() if is_desktop_image else None

pointer_to_loader(pointer: str, data: Data) -> Loader

Attempt to select an appropriate Loader subclass based on a PDS3 object name (and sometimes the file extension).

The apparently-redundant sequence of conditionals is not in fact redundant; it is based on our knowledge of the most frequently used but sometimes redundant object names in the PDS3 corpus.

Source code in pdr/loaders/dispatch.py
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def pointer_to_loader(pointer: str, data: Data) -> Loader:
    """
    Attempt to select an appropriate Loader subclass based on a PDS3 object
    name (and sometimes the file extension).

    The apparently-redundant sequence of conditionals is not in fact redundant;
    it is based on our knowledge of the most frequently used but sometimes
    redundant object names in the PDS3 corpus. The checks are tried in order
    and the first match wins; if nothing matches, a TBD loader is returned.
    """
    if check_trivial_case(pointer, data.identifiers, data.filename):
        return Trivial()
    if pointer == "LABEL":
        return ReadLabel()
    # each dispatch helper builds a loader when it matches, so call it once
    # and reuse the result rather than calling it again to return it
    library_loader = image_lib_dispatch(pointer, data)
    if library_loader is not None:
        return library_loader
    special_loader = special_pointer_dispatch(pointer, data.identifiers)
    if special_loader is not None:
        return special_loader
    if (
        "TEXT" in pointer
        or "PDF" in pointer
        or "MAP_PROJECTION_CATALOG" in pointer
        or "FILENAME" in pointer
    ):
        return ReadText()
    if "DESC" in pointer:  # probably points to a reference file
        return ReadText()
    if "ARRAY" in pointer:
        return ReadArray()
    table_words = (
        "TABLE", "SPREADSHEET", "CONTAINER", "SERIES", "SPECTRUM", "HISTOGRAM",
        "FILE"
    )
    # a table word counts only when it is not part of a *_HEADER or *_NAME
    # pointer (or HISTOGRAM_IMAGE), which are handled further down
    if (
        any(val in pointer for val in table_words)
        and not any(val + "_HEADER" in pointer for val in table_words)
        and not any(val + "_NAME" in pointer for val in table_words)
        and "HISTOGRAM_IMAGE" not in pointer
    ):
        return ReadTable()
    if "HEADER" in pointer:
        if looks_like_this_kind_of_file(
            data.file_mapping[pointer], FITS_EXTENSIONS
        ):
            return ReadFits()
        return ReadHeader()
    # I have moved this below "table" due to the presence of a number of
    # binary tables named things like "Image Time Table". If there are pictures
    # of tables, we will need to do something more sophisticated.
    if (
        ("IMAGE" in pointer)
        or ("QUB" in pointer)
        or ("XDR_DOCUMENT" in pointer)
    ):
        return ReadImage()
    if "FILE_NAME" in pointer:
        # NOTE(review): this passes the pointer name itself, not a resolved
        # file name, to file_extension_to_loader -- confirm that is intended
        return file_extension_to_loader(pointer)
    return TBD()

special_pointer_dispatch(pointer: str, identifiers)

Some pointers are misleadingly named and the wrong loader is selected in pointer_to_loader. To avoid making the pointer_to_loader logic too complex, we check for those special cases here and return the correct loader.

Source code in pdr/loaders/dispatch.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def special_pointer_dispatch(pointer: str, identifiers):
    """
    Some pointers are misleadingly named and the wrong loader is selected in
    pointer_to_loader. To avoid making the pointer_to_loader logic too complex,
    we check for those special cases here and return the correct loader, or
    None if the pointer is not one of the special cases.

    Both current cases apply only when the product's DATA_SET_ID contains
    "SLN-L-SP".
    """
    if "SLN-L-SP" not in identifiers['DATA_SET_ID']:
        return None
    # there is an ISIS program (kaguyasp2ascii) that will convert this data
    # to an ASCII table, but the label presents it as image data, so that
    # is how we load it.
    if "SPECTRUM" in pointer:
        return ReadImage()
    if "ANCILLARY_AND_SUPPLEMENT_DATA" in pointer:
        return ReadTable()
    return None

loaders.handlers

Pointy-end functions used by Loaders that primarily work by calling external libraries that provide high-level support for specific file formats, including pillow and astropy.io.fits.

_check_prescaled_desktop(fn: Union[str, Path])

Check whether a desktop-format image -- i.e., one we loaded with pillow -- can be treated as already scaled, i.e. does not need further scaling / masking / etc. Currently this returns False for JP2 and GeoTIFF images (which might need it) and True otherwise. There might be other heuristics.

Source code in pdr/loaders/handlers.py
207
208
209
210
211
212
213
214
215
216
217
218
219
220
def _check_prescaled_desktop(fn: Union[str, Path]):
    """
    Check whether a desktop-format image -- i.e., one we loaded with pillow --
    can be treated as prescaled.

    Returns False if the metadata skimmed from the image has any key
    containing 'GeoKey' (GeoTIFF) or reports its format as 'JPEG2000', since
    such images might need scaling / masking / etc.; returns True otherwise.
    There might be other heuristics.
    """
    from pdr.pil_utils import skim_image_data

    meta = skim_image_data(fn)
    has_geokeys = any('GeoKey' in key for key in meta.keys())
    # 'format' is only consulted when no GeoKey entries were found
    return not (has_geokeys or meta['format'] == 'JPEG2000')

add_bit_column_info(obj: dict, definition: MultiDict, identifiers: DataIdentifiers) -> dict

Parse the bit column description (if any) from a dict created from a COLUMN PVL object and add that parsed description to obj (most likely that definition plus block info). Used in queries.read_format_block().

Source code in pdr/loaders/handlers.py
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
def add_bit_column_info(
    obj: dict,
    definition: MultiDict,
    identifiers: DataIdentifiers
) -> dict:
    """
    Parse the bit column description (if any) from a `dict` created from a
    COLUMN PVL object and add that parsed description to `obj` (most likely
    that definition plus block info). Used in `queries.read_format_block()`.

    If `obj` has no "BIT_COLUMN" key it is returned untouched. Otherwise
    `check_special_bit_format` may substitute a special-case version of
    `obj`, spaces in its DATA_TYPE are replaced with underscores, a bit
    string data type is set if DATA_TYPE does not already mention
    BIT_STRING, and the result of `get_bit_start_and_size` is returned.
    """
    if "BIT_COLUMN" not in obj.keys():
        return obj
    from pdr.bit_handling import (
        set_bit_string_data_type, get_bit_start_and_size
    )
    from pdr.formats import check_special_bit_format

    is_special, special_obj = check_special_bit_format(
        obj, definition, identifiers
    )
    obj = special_obj if is_special else obj
    data_type = obj["DATA_TYPE"].replace(" ", "_")
    obj["DATA_TYPE"] = data_type
    if "BIT_STRING" not in data_type:
        obj = set_bit_string_data_type(obj, identifiers)
    return get_bit_start_and_size(obj, definition, identifiers)

handle_compressed_image(fn: Union[str, Path], frame: Optional[int] = None) -> np.ndarray

Open an image in a standard 'desktop' format (GIF, standard TIFF, GeoTIFF, classic JPEG, JPEG2000, PNG, etc.) using pillow. "Compressed" is slightly misleading, because this will work fine on uncompressed GeoTIFF etc.

Source code in pdr/loaders/handlers.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
def handle_compressed_image(
    fn: Union[str, Path], frame: Optional[int] = None
) -> np.ndarray:
    """
    Open an image in a standard 'desktop' format (GIF, standard TIFF, GeoTIFF,
    classic JPEG, JPEG2000, PNG, etc.) using pillow. "Compressed" is slightly
    misleading, because this will work fine on uncompressed GeoTIFF etc.

    If `frame` is given, that frame of the image is selected before reading.
    Palette-mode images are converted to RGB. Returns a contiguous array
    that is independent of the image file, which is closed before returning;
    3-dimensional results have their last axis moved to the front.

    Note that this sets `PIL.Image.MAX_IMAGE_PIXELS` to None for the whole
    process.
    """
    import numpy as np
    from PIL import Image

    # deactivate pillow's DecompressionBombError: many planetary images
    # are legitimately very large
    Image.MAX_IMAGE_PIXELS = None
    # the context manager closes the underlying file once the pixel data has
    # been copied out; otherwise the handle stays open until collected
    with Image.open(fn) as opened:
        if frame is not None:
            opened.seek(frame)
        im = opened
        if opened.mode == 'P':
            # images with imbedded palettes (usually GIFs)
            im = opened.convert('RGB', palette=opened.palette)
        # copy, so the result does not depend on pillow's buffers
        # noinspection PyTypeChecker
        image = np.ascontiguousarray(im).copy()
    # pillow reads images as [x, y, channel] rather than [channel, x, y]
    if len(image.shape) == 3:
        return np.ascontiguousarray(np.rollaxis(image, 2))
    return image

handle_fits_file(fn: str, name: str, hdu_id: Union[str, int, tuple[int, int]], hdulist: Optional[HDUList] = None, hdu_id_is_index: bool = False) -> dict[str, Union[MultiDict, pd.DataFrame, np.ndarray]]

Create an object or objects from an HDU of a FITS file using astropy.io.fits.

hdu_id may be the index of an HDU or the start byte of the HDU's header or data section; hdu_id_is_index=True means that it's the HDU's index. If it's a start byte, and it's the start byte of the HDU's header section, return just the header; otherwise return the data and the header. If it's an index, always return the data and the header (currently this is only used for primary FITS files, which by construction never have headers labeled as independent objects).

Source code in pdr/loaders/handlers.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def handle_fits_file(
    fn: str,
    name: str,
    hdu_id: Union[str, int, tuple[int, int]],
    hdulist: Optional[HDUList] = None,
    hdu_id_is_index: bool = False,
) -> dict[str, Union[MultiDict, pd.DataFrame, np.ndarray]]:
    """
    Create an object or objects from an HDU of a FITS file using
    `astropy.io.fits`.

    `hdu_id` may be the index of an HDU or the start byte of the HDU's header
    or data section; `hdu_id_is_index=True` means that it's the HDU's index.
    If it's a start byte, and it's the start byte of the HDU's header section,
    return just the header; otherwise return the data and the header. If it's
    an index, always return the data and the header (currently this is only
    used for primary FITS files, which by construction never have headers
    labeled as independent objects).

    `fn` is only opened if `hdulist` is None; an already-open `hdulist` is
    used as-is.

    Returns a dict: either `{name: header}` for a header-only object, or
    `{f"{name}_HEADER": header, name: body}`, where `body` is an empty
    array if the HDU has no data, a DataFrame if the HDU is a table, and the
    HDU's data as astropy provides it otherwise.
    """
    from astropy.io import fits

    if hdulist is None:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", module="astropy.io.fits.card")
            hdulist = fits.open(fn)
    if hdu_id_is_index is False:
        # hdu_id is a byte offset: look up which HDU, and which part of it,
        # that offset belongs to
        objrec = hdu_byte_index(hdulist)[hdu_id]
        hdu_ix, is_header = objrec['ix'], objrec['part'] == 'header'
    else:
        # this is the case when dealing with a FITS file in 'primary' mode.
        # This is a little sloppy, but it is more convenient to handle certain
        # things upstream. may want to collapse the two cases at some point.
        hdu_ix = hdu_id
        is_header = (
            "HEADER" in name
            # cases where HDUs are named things like "IMAGE HEADER"
            and name not in [h[1] for h in hdulist.info(False)]
        )
    try:
        hdr_val = handle_fits_header(hdulist, hdu_ix)
    # astropy.io.fits does not call any verification on read. on 'output'
    # tasks -- which iterating over header cards (sometimes) counts as, and
    # which we have to do in order to place the header content into our
    # preferred data structure -- it does call verification at the strictest
    # settings, resulting in delayed exceptions. However, we do not want to
    # automatically run every fix, because astropy's fixes can be quite slow
    # on large, complicated headers. So, if and when astropy decides something
    # is too invalid to show us, tell it to fix it.
    except fits.VerifyError:
        try:
            hdulist.verify('silentfix')
            hdr_val = handle_fits_header(hdulist, hdu_ix)
        except (fits.VerifyError, ValueError):  # real messed up
            # last resort: keep whatever cards can be read, drop the rest
            hdr_val = handle_fits_header(hdulist, hdu_ix, skip_bad_cards=True)
    if is_header is True:
        return {name: hdr_val}
    output, hdu = {f"{name}_HEADER": hdr_val}, hdulist[hdu_ix]
    # binary table HDUs with repeated column names break astropy -- it will not
    # actually afford the data unless we manipulate it first.
    if isinstance(hdu, fits.BinTableHDU):
        reindex_dupe_names(hdu)
    body = hdu.data
    if body is None:
        # This case is typically a 'stub' PRIMARY HDU. For type consistency,
        # we prefer to return an empty array rather than None.
        import numpy as np

        body = np.array([])
    elif isinstance(body, fits.fitsrec.FITS_rec):
        # This case is a FITS table, binary or ASCII. For type consistency, we
        # want to return a pandas DataFrame, not a FITS_rec.
        body = _convert_fits_table_to_df(body, hdu)
    return output | {name: body}

handle_fits_header(hdulist: HDUList, hdu_ix: int, skip_bad_cards: bool = False) -> MultiDict

Load the header of a specified HDU as a MultiDict, engaging in various sorts of gymnastics to stymie the attempts of astropy.io.fits to keep us safe from illegally-formatted headers.

Source code in pdr/loaders/handlers.py
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
def handle_fits_header(
    hdulist: HDUList,
    hdu_ix: int,
    skip_bad_cards: bool = False
) -> MultiDict:
    """
    Load the header of a specified HDU as a MultiDict, engaging in various
    sorts of gymnastics to stymie the attempts of astropy.io.fits to keep us
    safe from illegally-formatted headers.

    Each card with a non-empty keyword contributes one entry under its
    keyword. `str`, `float`, and `int` values are stored as-is; values whose
    class is named 'Undefined' are stored as None; anything else is stored
    as its `str()` form. A card with a non-empty comment also contributes a
    second entry under `<keyword>_comment`.

    If reading a card raises `fits.VerifyError`, the card is skipped when
    `skip_bad_cards` is True; otherwise the exception propagates.
    """
    astro_hdr, output_hdr = hdulist[hdu_ix].header, MultiDict()

    from astropy.io import fits
    # Cards are fetched one at a time by index, inside the try block, so that
    # a failure on a single card can be handled without abandoning the rest.
    # NOTE(review): presumably the unpacking below is where astropy parses
    # and verifies the card -- confirm against astropy's Card implementation.
    for i in range(len(astro_hdr.cards)):
        try:
            key, val, com = astro_hdr.cards[i]
            if len(key) == 0:
                # placeholder card records
                continue
            if isinstance(val, (str, float, int)):
                output_hdr.add(key, val)
            # We do not want to represent keyword-only cards with weird
            # special astropy objects.
            elif val.__class__.__name__ == 'Undefined':
                output_hdr.add(key, None)
            else:
                output_hdr.add(key, str(val))
            if len(com) > 0:
                comment_key = key + "_comment"
                output_hdr.add(comment_key, com)
        except fits.VerifyError:
            # Note that entries already added for this card before the error
            # (if any) are not rolled back.
            if skip_bad_cards is True:
                continue
            raise
        except StopIteration:
            # NOTE(review): unclear from here which call can raise
            # StopIteration; treated as "no more cards".
            break
    return output_hdr

hdu_byte_index(obj: Union[str, Path, HDUList]) -> dict

produce a dict describing the locations of HDUs and their headers within a FITS file.

Source code in pdr/loaders/handlers.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def hdu_byte_index(obj: Union[str, Path, HDUList]) -> dict:
    """
    Produce a dict describing the locations of HDUs and their headers within
    a FITS file.

    `obj` may be an already-open HDUList or a path to a FITS file. A path is
    opened here and closed again before returning; an HDUList supplied by the
    caller is left open.

    The returned dict is keyed by byte offset. Each HDU contributes two
    entries: one at its header offset (`'part': 'header'`) and one at its
    data offset (`'part': 'data'`). Both also carry the HDU's index (`'ix'`)
    and field 3 of that HDU's row in `HDUList.info(False)` (`'name'`).
    """
    from astropy.io import fits

    # Only close what we opened ourselves.
    opened_here = not isinstance(obj, fits.HDUList)
    hdul = fits.open(obj) if opened_here else obj
    info = {}
    try:
        hdulinfo = hdul.info(False)
        for hdu_ix, hdu in enumerate(hdul):
            hinfo = hdu.fileinfo()
            # NOTE(review): unpack_fits_headers() reads the HDU name from
            # field 1 of these same info rows; confirm field 3 is intended.
            baserec = {'ix': hdu_ix, 'name': hdulinfo[hdu_ix][3]}
            info[hinfo['hdrLoc']] = baserec | {'part': 'header'}
            info[hinfo['datLoc']] = baserec | {'part': 'data'}
    finally:
        if opened_here:
            hdul.close()
    return info

reindex_dupe_names(hdu: BinTableHDU)

Astropy cannot construct the .data attribute of a BinTableHDU if the table has duplicate column names. This changes any duplicate column names in place following the same convention we use for PDS binary tables (appending incrementing integers).

Source code in pdr/loaders/handlers.py
164
165
166
167
168
169
170
171
172
173
174
175
176
def reindex_dupe_names(hdu: BinTableHDU):
    """
    Rename duplicated column names of a BinTableHDU in place.

    Astropy cannot construct the .data attribute of a BinTableHDU whose table
    has duplicate column names. Every column whose name occurs more than once
    gets `_<n>` appended, where n counts occurrences of that name from 0 in
    column order -- the same convention we use for PDS binary tables. Columns
    with unique names are left alone.
    """
    # Map each original name to the positions of the columns that carry it.
    positions_by_name = {}
    for position, column in enumerate(hdu.columns):
        positions_by_name.setdefault(column.name, []).append(position)
    for positions in positions_by_name.values():
        if len(positions) < 2:
            continue
        for occurrence, position in enumerate(positions):
            hdu.columns[position].name += f"_{occurrence}"

unpack_fits_headers(filename: Union[str, Path], hdulist: Optional[HDUList] = None) -> tuple[MultiDict, list[str], dict[str, int]]

Unpack all headers in a FITS file into a MultiDict and flattened list of keys suitable for constructing a pdr.Metadata object, along with a mapping between HDU names and indices. Used when opening a FITS file in "primary" mode (i.e., directly from its own headers, without a supporting PDS3 or PDS4 label).

Source code in pdr/loaders/handlers.py
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
def unpack_fits_headers(
    filename: Union[str, Path], hdulist: Optional[HDUList] = None
) -> tuple[MultiDict, list[str], dict[str, int]]:
    """
    Unpack all headers in a FITS file into a MultiDict and flattened list of
    keys suitable for constructing a `pdr.Metadata` object, along with a
    mapping between HDU names and indices. Used when opening a FITS file in
    "primary" mode (i.e., directly from its own headers, without a supporting
    PDS3 or PDS4 label).

    If `hdulist` is None, `filename` is opened with astropy and closed again
    before returning. If `hdulist` is given, it is used as-is (and left
    open), and `filename` is not read.

    HDUs that share a name are disambiguated by appending `_<n>`, where n
    counts from 0 within the group of HDUs with that name.
    """
    from astropy.io import fits

    hdumap = {}
    headerdict = MultiDict()
    # Only close the HDUList if it was opened here; a caller-supplied one
    # remains the caller's responsibility.
    opened_here = hdulist is None
    if opened_here:
        hdulist = fits.open(filename)
    try:
        # info rows are grouped on field 1, the HDU name
        namegroups = groupby(lambda hi: hi[1], hdulist.info(False))
        for name, group in namegroups.items():
            if len(group) == 1:
                hdu_ix = group[0][0]
                headerdict.add(name, handle_fits_header(hdulist, hdu_ix))
                hdumap[name] = hdu_ix
                continue
            for ix, hdu in enumerate(group):
                hdu_ix, hdu_name = hdu[0], f'{name}_{ix}'
                headerdict.add(hdu_name, handle_fits_header(hdulist, hdu_ix))
                hdumap[hdu_name] = hdu_ix
    finally:
        if opened_here:
            hdulist.close()
    params = []
    for hdu_name in headerdict.keys():
        # note that FITS headers aren't nested, so we only have to iterate
        # over one level. How refreshing!
        params.append(hdu_name)
        params.extend(headerdict[hdu_name].keys())
    return headerdict, params, hdumap

loaders.image

Functions for the nitty-gritty array-shaping parts of image loading.

convert_if_vax(image: np.ndarray, props: dict) -> np.ndarray

If an array is in 32-bit VAX real format, convert it to 32-bit float.

Source code in pdr/loaders/image.py
76
77
78
79
80
def convert_if_vax(image: np.ndarray, props: dict) -> np.ndarray:
    """
    Return `image` converted via `vax.from_vax32` when `props['is_vax_real']`
    is exactly True (32-bit VAX real format); otherwise return `image`
    unchanged.
    """
    if props.get('is_vax_real') is not True:
        return image
    return vax.from_vax32(image)

extract_axplanes(image: np.ndarray, props: ImageProps) -> tuple[np.ndarray, dict[str, np.ndarray]]

extract ISIS-style side/bottom/top/backplanes from an array

Source code in pdr/loaders/image.py
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def extract_axplanes(
    image: np.ndarray, props: ImageProps
) -> tuple[np.ndarray, dict[str, np.ndarray]]:
    """
    Extract ISIS-style side/bottom/top/backplanes from an array.

    For each `{prefix,suffix}_{row,col,band}s` key present in `props` (value
    not None), that many elements are split off the start (prefix) or end
    (suffix) of the corresponding axis. Band/row/col map to axes 0/1/2, or
    to -1/0/1 when `image` is 2D (so a band entry on a 2D image selects no
    axis and the plane is the whole image).

    Returns the trimmed image and a dict mapping each processed key to the
    plane that was split off. A count of 0 yields an empty plane and leaves
    the image untouched, for suffixes as well as prefixes.
    """
    axis_numbers = {"band": 0, "row": 1, "col": 2}
    axplanes = {}
    for side, ax in product(("prefix", "suffix"), ("row", "col", "band")):
        # noinspection PyTypedDict
        if (count := props.get(f"{side}_{ax}s")) is None:
            continue
        axn = axis_numbers[ax]
        if image.ndim == 2:
            axn -= 1
        aslice = [slice(None)] * image.ndim
        pslice = [slice(None)] * image.ndim
        if 0 <= axn < image.ndim:
            if side == "prefix":
                aslice[axn] = slice(count, None)
                pslice[axn] = slice(None, count)
            else:
                # Split at an explicit non-negative index rather than using
                # -count: slice(None, -0) is slice(None, 0), which would
                # discard the whole image when count is 0.
                split = max(image.shape[axn] - count, 0)
                aslice[axn] = slice(None, split)
                pslice[axn] = slice(split, None)
        axplanes[f"{side}_{ax}s"] = image[tuple(pslice)]
        image = image[tuple(aslice)]
    return image, axplanes

extract_bil_linefix(image: np.ndarray, props: ImageProps) -> tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]

If they exist, extract line prefixes and/or suffixes from a raveled BIL (LINE_INTERLEAVED) image. Return the image shorn of pre/suffixes, the prefixes (if any), and the suffixes (if any).

Source code in pdr/loaders/image.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def extract_bil_linefix(
    image: np.ndarray, props: ImageProps
) -> tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]:
    """
    If they exist, extract line prefixes and/or suffixes from a raveled BIL
    (LINE_INTERLEAVED) image. Return the image shorn of pre/suffixes, the
    prefixes (if any), and the suffixes (if any).

    When `props['linepad']` is 0, `image` is returned untouched. Otherwise it
    is reshaped to `props['nrows']` rows, and `line_suffix_pix` /
    `line_prefix_pix` columns are split off the end / start of every row.
    A missing, None, or zero width means "no suffix/prefix", matching
    `extract_single_band_linefix`.
    """
    if props["linepad"] == 0:
        return image, None, None
    prefix, suffix = None, None
    # integer division: avoids a float round trip on the column count
    image = image.reshape(props["nrows"], image.size // props["nrows"])
    # A width of 0 must be skipped: image[:, :-0] is image[:, :0], which
    # would throw away every column.
    suffix_pix = props.get("line_suffix_pix") or 0
    if suffix_pix > 0:
        suffix = image[:, -suffix_pix:]
        image = image[:, :-suffix_pix]
    prefix_pix = props.get("line_prefix_pix") or 0
    if prefix_pix > 0:
        prefix = image[:, :prefix_pix]
        image = image[:, prefix_pix:]
    return image, prefix, suffix

extract_single_band_linefix(image: np.ndarray, props: ImageProps) -> tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]

If they exist, extract line prefixes and/or suffixes from a single-band image (i.e., a 2D ndarray). Return the image shorn of pre/suffixes, the prefixes (if any), and the suffixes (if any).

Source code in pdr/loaders/image.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def extract_single_band_linefix(
    image: np.ndarray, props: ImageProps
) -> tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]:
    """
    Split line prefixes and/or suffixes off a single-band image.

    With `props['linepad'] == 0` the input is handed back as-is alongside
    two Nones. Otherwise the array is reshaped to `nrows` rows of
    `ncols + linepad` elements; `line_suffix_pix` trailing columns and then
    `line_prefix_pix` leading columns are removed when those widths are
    greater than 0. Returns (trimmed image, prefixes or None, suffixes or
    None).
    """
    if props["linepad"] == 0:
        return image, None, None
    rows = image.reshape(props["nrows"], props["ncols"] + props["linepad"])
    prefix = suffix = None
    # the suffix comes off first, then the prefix
    n_suffix = props.get("line_suffix_pix", 0)
    if n_suffix > 0:
        suffix, rows = rows[:, -n_suffix:], rows[:, :-n_suffix]
    n_prefix = props.get("line_prefix_pix", 0)
    if n_prefix > 0:
        prefix, rows = rows[:, :n_prefix], rows[:, n_prefix:]
    return rows, prefix, suffix

make_format_specifications(props: ImageProps) -> tuple[str, np.dtype]

Given an image properties dict, construct a struct format string and a numpy dtype that could be used to interpret the described image using, respectively, struct or numpy.

Source code in pdr/loaders/image.py
42
43
44
45
46
47
48
49
50
51
52
def make_format_specifications(props: ImageProps) -> tuple[str, np.dtype]:
    """
    Build a struct format string and a numpy dtype from an image properties
    dict.

    The first character of `props['sample_type']` is used as the byte-order
    mark for both. The struct format is that mark, then `props['pixels']`,
    then the last character of the sample type; the dtype is constructed
    from the sample type string itself (mark plus everything after it).
    """
    sample_type = props["sample_type"]
    byteorder, struct_code = sample_type[0], sample_type[-1]
    struct_fmt = f"{byteorder}{props['pixels']}{struct_code}"
    numpy_code = sample_type[1:]
    return struct_fmt, np.dtype(f"{byteorder}{numpy_code}")

process_multiband_image(f: BufferedIOBase, props: ImageProps) -> tuple[np.ndarray, dict[str, np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]

Load the elements of a multiband image from an open file stream, reshape the resulting array as appropriate for the image's band storage type, perform any cleanup / segmentation operations implied by the props dict, and return it, along with any side/bottom/topplanes or line pre/suffixes.

Source code in pdr/loaders/image.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
def process_multiband_image(f: BufferedIOBase, props: ImageProps) -> tuple[
    np.ndarray,
    dict[str, np.ndarray],
    Optional[np.ndarray],
    Optional[np.ndarray]
]:
    """
    Read a multiband image from an open file stream and shape it into a
    (band, line, sample) array.

    `props['pixels']` elements are read, VAX-converted if flagged, and
    reshaped according to `props['band_storage_type']`; an unrecognized
    storage type triggers a warning and is treated as BAND_SEQUENTIAL.
    Returns the C-contiguous image with axplanes removed, the dict of
    extracted axplanes, and line prefixes / suffixes (each None unless the
    image is LINE_INTERLEAVED and has them).
    """
    storage = props["band_storage_type"]
    supported = ("BAND_SEQUENTIAL", "LINE_INTERLEAVED", "SAMPLE_INTERLEAVED")
    if storage not in supported:
        warnings.warn(
            f"Unsupported BAND_STORAGE_TYPE={storage}. "
            f"Guessing BAND_SEQUENTIAL."
        )
        storage = "BAND_SEQUENTIAL"
    numpy_dtype = make_format_specifications(props)[1]
    raw = np_from_buffered_io(f, numpy_dtype, count=props["pixels"])
    raw = convert_if_vax(raw, props)
    # on-disk extents include any axplane padding
    n_bands = props["nbands"] + props["bandpad"]
    n_lines = props["nrows"] + props["rowpad"]
    n_samples = props["ncols"] + props["colpad"]
    prefix = suffix = None
    if storage == "LINE_INTERLEAVED":
        # NOTE: linefix extraction is only implemented for BIL images
        # because no non-BIL multiband images with linefixes have been found
        # in the PDS yet. queries.check_fix_validity() raises
        # NotImplementedError for them, at which point support can be added.
        raw, prefix, suffix = extract_bil_linefix(raw, props)
        cube = np.moveaxis(raw.reshape(n_lines, n_bands, n_samples), 0, 1)
    elif storage == "SAMPLE_INTERLEAVED":
        cube = np.moveaxis(raw.reshape(n_lines, n_samples, n_bands), 2, 0)
    else:  # BAND_SEQUENTIAL
        cube = raw.reshape(n_bands, n_lines, n_samples)
    cube, axplanes = extract_axplanes(cube, props)
    return make_c_contiguous(cube), axplanes, prefix, suffix

process_single_band_image(f: BufferedIOBase, props: ImageProps) -> tuple[np.ndarray, dict[str, np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]

Load a single-band image from an open file stream, perform any cleanup / segmentation operations implied by the props dict, and return it, along with any side/bottom/topplanes or line pre/suffixes.

Source code in pdr/loaders/image.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def process_single_band_image(
    f: BufferedIOBase, props: ImageProps
) -> tuple[
    np.ndarray,
    dict[str, np.ndarray],
    Optional[np.ndarray],
    Optional[np.ndarray]
]:
    """
    Read a single-band image from an open file stream and shape it into a
    2D (line, sample) array.

    Line prefixes/suffixes are split off first, then the data are
    VAX-converted if flagged, reshaped, and stripped of axplanes. Returns
    the C-contiguous image, the dict of extracted axplanes, and the line
    prefixes and suffixes (None where absent).
    """
    numpy_dtype = make_format_specifications(props)[1]
    # 'count' bounds the read to this image's elements; it was added to
    # handle a case in which the image was not the last object in the file.
    flat = np_from_buffered_io(f, dtype=numpy_dtype, count=props["pixels"])
    flat, prefix, suffix = extract_single_band_linefix(flat, props)
    flat = convert_if_vax(flat, props)
    # on-disk extents include any axplane padding
    n_lines = props["nrows"] + props["rowpad"]
    n_samples = props["ncols"] + props["colpad"]
    plane, axplanes = extract_axplanes(
        flat.reshape((n_lines, n_samples)), props
    )
    return make_c_contiguous(plane), axplanes, prefix, suffix

read_image(name: str, gen_props: ImageProps, fn: str, start_byte: int) -> np.ndarray

Read an IMAGE object and return it as a numpy array.

Source code in pdr/loaders/image.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def read_image(
    name: str, gen_props: ImageProps, fn: str, start_byte: int
) -> np.ndarray:
    """
    Read an IMAGE object and return it as a numpy array.

    `fn` is opened through `decompress()`, positioned at `start_byte`, and
    always closed again, whether or not loading succeeds; any exception
    raised while seeking or loading propagates unchanged.

    If `name` contains "PREFIX" or "SUFFIX", the line prefixes or suffixes
    extracted during loading are returned instead of the image itself (this
    may be None if the image has none).
    """
    props = get_image_properties(gen_props)
    f = decompress(fn)  # seamlessly deal with compression
    try:
        # seek inside the try block so a failed seek still closes the file
        f.seek(start_byte)
        # Make sure that single-band images are 2-dim arrays.
        if props["nbands"] == 1:
            image, _axplanes, pre, suf = process_single_band_image(f, props)
        else:
            image, _axplanes, pre, suf = process_multiband_image(f, props)
    finally:
        f.close()
    if "PREFIX" in name:
        return pre
    if "SUFFIX" in name:
        return suf
    return image

loaders.queries

Functions used as part of Loader subclasses' softquery()-backed metadata-processing workflows.

DEFAULT_DATA_QUERIES = MappingProxyType({'identifiers': get_identifiers, 'block': specialize(get_block, check_special_block), 'fn': get_file_mapping, 'target': get_target, 'start_byte': specialize(data_start_byte, check_special_offset), 'debug': get_debug, 'return_default': get_return_default}) module-attribute

Queries common to most Loaders.

START_BYTE_QUERIES = MappingProxyType({'identifiers': get_identifiers, 'block': specialize(get_block, check_special_block), 'fn': get_file_mapping, 'target': get_target, 'start_byte': specialize(data_start_byte, check_special_offset)}) module-attribute

Queries for simply finding an object's start byte and containing file. Used for the standalone_start_byte() 'a la carte' function below, designed to support implicit object association.

_extract_table_records(block)

Attempt to get the number of 'records', which can mean either row count or records defined by byte length in a way that does not necessarily correspond to number of rows, from a TABLE/SPREADSHEET definition.

Source code in pdr/loaders/queries.py
428
429
430
431
432
433
434
435
436
437
438
def _extract_table_records(block):
    """
    Attempt to get the number of 'records', which can mean either row count
    or records defined by byte length in a way that does not necessarily
    correspond to number of rows, from a TABLE/SPREADSHEET definition.
    """
    if "RECORDS" in block.keys():
        return block["RECORDS"]
    elif "ROWS" in block.keys():
        return block["ROWS"]
    return None

_fill_empty_byte_rows(fmtdef: pd.DataFrame) -> pd.DataFrame

Fill any missing byte rows in a format definition. For each row whose BYTES value is missing, BYTES is computed as ITEMS × ITEM_BYTES.

Source code in pdr/loaders/queries.py
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
def _fill_empty_byte_rows(fmtdef: pd.DataFrame) -> pd.DataFrame:
    """
    Fill any missing byte rows in a format definition. This is typically
    used to fill
    """
    nobytes = fmtdef["BYTES"].isna()
    with warnings.catch_warnings():
        # TODO: although we do not care that .loc will set items inplace later, 
        #  at all, this will hard-fail in pandas 3.x and needs to be changed.
        warnings.simplefilter("ignore", category=FutureWarning)
        fmtdef.loc[nobytes, "BYTES"] = (
            # TODO: I think the subsequent TODO is out of date?
            # TODO, maybe: update with ITEM_OFFSET should we implement that
            fmtdef.loc[nobytes, "ITEMS"]
            * fmtdef.loc[nobytes, "ITEM_BYTES"]
        )
    fmtdef["BYTES"] = fmtdef["BYTES"].astype(int)
    return fmtdef

_fix_up_line_prefix_table_block(data: PDRLike, name: str, parent_block: MultiDict)

Deal with assorted quirks of underspecified line prefix table definitions that will stymie the primary table format interpretation workflow.

Source code in pdr/loaders/queries.py
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
def _fix_up_line_prefix_table_block(
    data: PDRLike, name: str, parent_block: MultiDict
):
    """
    Build a table-style block for a line prefix table from its parent block,
    papering over quirks of underspecified definitions that would stymie the
    primary table format interpretation workflow.

    Format files are injected into the parent block's items, and the result
    is wrapped in a new MultiDict. If ROWS is absent it is copied from
    LINES; if ROW_SUFFIX_BYTES is absent it is set to
    SAMPLE_BITS // 8 * LINE_SAMPLES. Returns the new block.
    """
    # TODO: we have to unpack nested structure definitions here in order to
    #  get properties like ROWS and ROW_SUFFIX_BYTES into the block. This is
    #  very ugly, and perhaps should be reworked.
    parent_items = list(parent_block.items())
    # TODO: will need to modify this simple lookup in data.file_mapping if we
    #  ever have a case of this type where the filename must be specialized.
    #  Hopefully that does not happen as it will require mutating the
    #  signature of get_block().
    mapped_file = data.file_mapping[name]
    block = MultiDict(
        inject_format_files(parent_items, name, mapped_file, data)
    )
    present = block.keys()
    if "ROWS" not in present:
        block["ROWS"] = block["LINES"]
    if "ROW_SUFFIX_BYTES" not in present:
        bytes_per_sample = block['SAMPLE_BITS'] // 8
        block['ROW_SUFFIX_BYTES'] = bytes_per_sample * block['LINE_SAMPLES']
    return block

_probably_ascii(block: MultiDict, fmtdef: pd.DataFrame, name: str) -> bool

Attempt to determine whether a TABLE is ASCII from its label block and format definition.

Source code in pdr/loaders/queries.py
558
559
560
561
562
563
564
565
566
def _probably_ascii(block: MultiDict, fmtdef: pd.DataFrame, name: str) -> bool:
    """
    Attempt to determine whether a TABLE is ASCII from its label block and
    format definition.
    """
    return (
        fmtdef["DATA_TYPE"].str.contains("ASCII").any()
        or looks_like_ascii(block, name)
    )

_table_length(block, identifiers, n_records)

Source code in pdr/loaders/queries.py
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
def _table_length(block, identifiers, n_records):
    """"""
    length = None
    try:
        if "BYTES" in block.keys():
            length = block["BYTES"]
        elif n_records is not None:
            if "RECORD_BYTES" in block.keys():
                record_length = block["RECORD_BYTES"]
            elif "ROW_BYTES" in block.keys():
                record_length = block["ROW_BYTES"]
                record_length += block.get("ROW_SUFFIX_BYTES", 0)
            elif identifiers["RECORD_BYTES"] is not None:
                # TODO, probably, and applicable many more places than here:
                #  ideally we don't use identifiers for anything but special
                #  case checks.
                record_length = identifiers["RECORD_BYTES"]
            else:
                record_length = None
            if record_length is not None:
                length = record_length * n_records
    except AttributeError:
        pass
    return length

_table_row_position(n_records, target: PhysicalTarget) -> tuple[Optional[int], int]

Get physical start row and number of rows for a delimited ASCII table with no explicitly-defined row byte length.

A return value of None for length implies that the table occupies the entirety of the file including and after start.

Source code in pdr/loaders/queries.py
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
def _table_row_position(
    n_records, target: PhysicalTarget
) -> tuple[Optional[int], int]:
    """
    Get physical start row and number of rows for a delimited ASCII table with
    no explicitly-defined row byte length.

    A return value of None for `length` implies that the table occupies the
    entirety of the file including and after `start`.
    """
    if isinstance(target[1], dict):
        # noinspection PyTypeChecker
        start = target[1]["value"] - 1
    else:
        try:
            start = target[1] - 1
        except TypeError:
            # You cannot subtract an integer from a string. If target[1] is a
            # string, it implies that the PhysicalTarget is also a string,
            # meaning that it specifies only a filename, which implies that the
            # table starts at the beginning of the file.
            start = 0
    return n_records, start

base_sample_info(block: MultiDict) -> dict

Determine basic sample-level type info for an image object.

Source code in pdr/loaders/queries.py
211
212
213
214
215
216
def base_sample_info(block: MultiDict) -> dict:
    """
    Determine basic sample-level type info for an image object.

    Returns a dict with BYTES_PER_PIXEL (SAMPLE_BITS / 8 truncated to int;
    0 if SAMPLE_BITS is absent) and SAMPLE_TYPE (empty string if absent).
    """
    sample_bits = block.get("SAMPLE_BITS", 0)
    sample_type = block.get("SAMPLE_TYPE", "")
    info = {}
    info["BYTES_PER_PIXEL"] = int(sample_bits / 8)
    info["SAMPLE_TYPE"] = sample_type
    return info

check_array_for_subobject(block: MultiDict) -> bool

Does an ARRAY definition contain a definition for a subobject? If it (illegally) contains more than one, raise a ValueError.

Source code in pdr/loaders/queries.py
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
def check_array_for_subobject(block: MultiDict) -> bool:
    """
    Report whether an ARRAY definition contains a subobject definition
    (ARRAY, BIT_ELEMENT, COLLECTION, or ELEMENT).

    Returns True if exactly one of those keys is present and False if none
    is. Raises ValueError if more than one is present, which is illegal.
    """
    found = [
        kind
        for kind in ("ARRAY", "BIT_ELEMENT", "COLLECTION", "ELEMENT")
        if kind in block
    ]
    n_found = len(found)
    if n_found > 1:
        raise ValueError(
            f"ARRAY objects may only have one subobject (this has {n_found})"
        )
    return n_found == 1

check_fix_validity(props: ImageProps) -> None

Integrity checker for 'conventional' line pre/suffix definitions.

Source code in pdr/loaders/queries.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def check_fix_validity(props: ImageProps) -> None:
    """
    Integrity checker for 'conventional' line pre/suffix definitions.

    Raises NotImplementedError for three unsupported layouts: line
    pre/suffixes combined with ISIS-style axplanes; axplanes along more than
    one axis; and line pre/suffixes on a multiband image whose band storage
    type is neither None nor LINE_INTERLEAVED. Returns None otherwise.
    """
    has_linefix = props["linepad"] > 0
    axplane_pads = (props["rowpad"], props["colpad"], props["bandpad"])
    if has_linefix and sum(axplane_pads) > 0:
        raise NotImplementedError(
            "Objects that contain both 'conventional' line pre/suffixes and "
            "ISIS-style side/back/bottomplanes are not supported."
        )
    if len(gt0f(axplane_pads)) > 1:
        raise NotImplementedError(
            "ISIS-style axplanes along multiple axes are not supported."
        )
    if not has_linefix:
        return
    non_bil = props["band_storage_type"] not in (None, "LINE_INTERLEAVED")
    if non_bil and props["nbands"] > 1:
        raise NotImplementedError(
            "'Conventional' line pre/suffixes are not supported for non-BIL "
            "multiband images."
        )

check_if_qube(name: str, block: MultiDict, band_storage_type: BandStorageType) -> tuple[bool, Optional[dict]]

If this is a metadata block associated with a qube-type object, parse its properties using the various special rules necessary to read ISIS2 parameters.

Source code in pdr/loaders/queries.py
169
170
171
172
173
174
175
176
177
178
179
180
181
182
def check_if_qube(
    name: str,
    block: MultiDict,
    band_storage_type: BandStorageType
) -> tuple[bool, Optional[dict]]:
    """
    Detect an ISIS2 QUBE-type object by name and, if it is one, parse its
    properties with the special rules needed for ISIS2 parameters.

    Returns `(True, generic_qube_properties(block, band_storage_type))` when
    "QUBE" occurs in `name`, and `(False, None)` otherwise.
    """
    if "QUBE" not in name:
        return False, None
    # ISIS2 QUBE format
    qube_props = generic_qube_properties(block, band_storage_type)
    return True, qube_props

data_start_byte(identifiers: DataIdentifiers, block: Mapping, target, fn) -> int

Determine the first byte of the data in a file from its pointer.

Source code in pdr/loaders/queries.py
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
def data_start_byte(
    identifiers: DataIdentifiers, block: Mapping, target, fn
) -> int:
    """
    Determine the first byte of the data in a file from its pointer.

    `target` is the pointer value; if it is a list or tuple, only its last
    element is used. Interpretation by type:

    * int: 1 means byte 0. Otherwise the offset is
      `record_bytes * (target - 1)`, where `record_bytes` comes from
      `block["RECORD_BYTES"]` if present and from
      `identifiers["RECORD_BYTES"]` if not. If no record length is
      available but `block` defines ROWS, the offset is computed by
      `count_from_bottom_of_file(fn, rows, row_bytes)`, floored at 0.
    * dict: delegated to `quantity_start_byte(target, record_bytes)`.
    * str: byte 0.

    `block` may be None, in which case only `identifiers` is consulted.

    Raises ValueError if no start byte can be derived from `target`, or if
    the derived start byte is negative.
    """
    have_block = block is not None
    if have_block and ("RECORD_BYTES" in block.keys()):
        record_bytes = block["RECORD_BYTES"]
    else:
        record_bytes = identifiers["RECORD_BYTES"]
    start_byte = None
    if isinstance(target, (list, tuple)):
        target = target[-1]
    if isinstance(target, int):
        if target == 1:
            start_byte = 0
        elif record_bytes not in [None, ""]:
            start_byte = record_bytes * max(target - 1, 0)
        # block is allowed to be None (see above), so it must be guarded
        # here too; otherwise this branch dies with AttributeError instead
        # of reaching the ValueError below.
        elif have_block and "ROWS" in block.keys():
            rows = block["ROWS"]
            row_bytes = block["ROW_BYTES"]
            start_byte = max(0, count_from_bottom_of_file(fn, rows, row_bytes))
    elif isinstance(target, dict):
        start_byte = quantity_start_byte(target, record_bytes)
    elif isinstance(target, str):
        start_byte = 0
    if start_byte is not None:
        if start_byte < 0:
            raise ValueError(f"BUG: start_byte={start_byte} < 0")
        return start_byte
    raise ValueError(f"Unknown data pointer format: {target}")

extract_axplane_metadata(block: MultiDict, props: dict) -> dict

extract metadata for ISIS-style side/back/bottomplanes

Source code in pdr/loaders/queries.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
def extract_axplane_metadata(block: MultiDict, props: dict) -> dict:
    """
    Extract metadata for ISIS-style side/back/bottomplanes.

    For each axis (BAND, LINE, SAMPLE) and side (PREFIX, SUFFIX) for which
    `block` defines `<AXIS>_<SIDE>_ITEM_BYTES`, the plane's total size in
    bytes is converted to a count of array elements using
    `props["BYTES_PER_PIXEL"]`.

    Returns a dict that always holds `rowpad`, `colpad`, and `bandpad` (the
    total number of padding elements along each axis), plus a
    `<side>_<row|col|band>s` entry for every plane found.

    Raises ValueError if item bytes are given without `<SIDE>_ITEMS`, or if
    `props` has no `axnames`. Raises NotImplementedError for item-byte
    sequences with differing values and for planes whose byte size is not a
    whole number of array elements.
    """
    # shorthand relating side/backplane "direction" to row/column axes.
    rowcol = {"SAMPLE": "col", "LINE": "row", "BAND": "band"}
    axplane_metadata = {"rowpad": 0, "colpad": 0, "bandpad": 0}
    for ax, side in product(("BAND", "LINE", "SAMPLE"), ("PREFIX", "SUFFIX")):
        if (itembytes := block.get(f"{ax}_{side}_ITEM_BYTES")) is None:
            continue
        if (itemcount := block.get(f"{side}_ITEMS")) is None:
            raise ValueError(
                f"Specified {ax} {side} item bytes with no specified "
                f"number of items; can't interpret."
            )
        if props.get("axnames") is None:
            raise ValueError(
                f"Specified {ax} {side} items with no specified axis "
                f"order; can't interpret."
            )
        if isinstance(itembytes, Sequence):
            if len(set(itembytes)) > 1:
                # (adjacent literals: each needs its own trailing space)
                raise NotImplementedError(
                    "Variable-length axplanes are not yet supported. Please "
                    "file an issue on github with this product as an example "
                    "so we can use it to test development and provide support."
                )
            # per-item sizes were listed explicitly; add them up
            fixbytes = sum(itembytes)
        else:
            # one size for all items: multiply by the item count on this axis
            fixbytes = itemcount[props["axnames"].index(ax)] * itembytes
        fix_pix = fixbytes / props["BYTES_PER_PIXEL"]
        if int(fix_pix) != fix_pix:
            raise NotImplementedError(
                "Pre/suffix itemsize < array itemsize is not supported."
            )
        axplane_metadata[f"{side.lower()}_{rowcol[ax]}s"] = int(fix_pix)
        axplane_metadata[f"{rowcol[ax]}pad"] += int(fix_pix)
    return axplane_metadata

extract_linefix_metadata(block: MultiDict, props: dict) -> dict

extract metadata for line prefix/suffix 'tables'

Source code in pdr/loaders/queries.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def extract_linefix_metadata(block: MultiDict, props: dict) -> dict:
    """
    Extract metadata for line prefix/suffix 'tables'.

    For each of LINE_PREFIX_BYTES and LINE_SUFFIX_BYTES that is present in
    `block` and nonzero, the byte count is converted to a number of array
    elements using `props["BYTES_PER_PIXEL"]` and stored as
    `line_prefix_pix` / `line_suffix_pix`. `linepad` is always present and
    holds their sum. Raises NotImplementedError if a byte count is not a
    whole number of array elements.
    """
    bytes_per_pixel_key = "BYTES_PER_PIXEL"
    linefix_metadata = {"linepad": 0}
    for side in ("PREFIX", "SUFFIX"):
        n_bytes = block.get(f"LINE_{side}_BYTES")
        if n_bytes in (0, None):
            continue
        n_pix = n_bytes / props[bytes_per_pixel_key]
        if n_pix != int(n_pix):
            raise NotImplementedError(
                "Line pre/suffixes not aligned with array element size are "
                "not supported."
            )
        n_pix = int(n_pix)
        linefix_metadata[f"line_{side.lower()}_pix"] = n_pix
        linefix_metadata["linepad"] += n_pix
    return linefix_metadata

generic_image_properties(block: MultiDict, sample_type: str) -> ImageProps

Construct a dict of image properties later used in the image-loading workflow.

Source code in pdr/loaders/queries.py
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
def generic_image_properties(block: MultiDict, sample_type: str) -> ImageProps:
    """
    Build the dict of image properties used later in the image-loading
    workflow from an image definition `block` and an already-determined
    `sample_type`. Raises ValueError if the block declares more than one
    band but gives no BAND_STORAGE_TYPE.
    """
    props = {}
    # TODO: BYTES_PER_PIXEL check appears repeated with slight variation
    #  from base_sample_info()
    props["BYTES_PER_PIXEL"] = int(block["SAMPLE_BITS"] / 8)
    props["is_vax_real"] = block.get("SAMPLE_TYPE") == "VAX_REAL"
    props["sample_type"] = sample_type
    props["nrows"] = block["LINES"]
    props["ncols"] = block["LINE_SAMPLES"]
    if "BANDS" not in block:
        # no BANDS key: treat as a single-band image
        props["nbands"] = 1
        props["band_storage_type"] = None
    else:
        props["nbands"] = block["BANDS"]
        props["band_storage_type"] = block.get("BAND_STORAGE_TYPE", None)
        # TODO: assess whether this is always ok
        if props["band_storage_type"] is None and props["nbands"] > 1:
            raise ValueError(
                "Cannot read 3D image with no specified band storage type."
            )
    # both helpers read the props assembled so far, so they must run last
    props.update(extract_axplane_metadata(block, props))
    props.update(extract_linefix_metadata(block, props))
    # noinspection PyTypeChecker
    return props  # not type-complete, 'pixels' added in get_image_properties()

generic_qube_properties(block: MultiDict, band_storage_type: BandStorageType) -> ImageProps

Parse metadata from an ISIS2-style QUBE definition

Source code in pdr/loaders/queries.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def generic_qube_properties(
    block: MultiDict, band_storage_type: BandStorageType
) -> ImageProps:
    """
    Parse metadata from an ISIS2-style QUBE definition.

    Core parameters (CORE_ITEM_BYTES, CORE_ITEM_TYPE, CORE_ITEMS) are read
    from `block["CORE"]` if `block` has a "CORE" key and from `block` itself
    otherwise. If `band_storage_type` is None, it is inferred from the axis
    names when they are available; otherwise it is set to "ISIS2_QUBE".
    Axplane and line prefix/suffix metadata are merged in last.
    """
    props = {}
    # core parameters may be nested under a CORE key or sit at the top level
    use_block = block if "CORE" not in block.keys() else block["CORE"]
    props["BYTES_PER_PIXEL"] = int(use_block["CORE_ITEM_BYTES"])  # / 8)
    # TODO: this should probably have for_numpy set to True
    props["sample_type"] = sample_types(
        use_block["CORE_ITEM_TYPE"], props["BYTES_PER_PIXEL"]
    )
    props["is_vax_real"] = use_block["CORE_ITEM_TYPE"] == "VAX_REAL"
    if "AXIS_NAME" in set(block.keys()).union(use_block.keys()):
        # prefer a top-level AXIS_NAME, falling back to the one in use_block
        props['axnames'] = block.get("AXIS_NAME")
        if props['axnames'] is None:
            props['axnames'] = use_block.get("AXIS_NAME")
        ax_map = {"LINE": "nrows", "SAMPLE": "ncols", "BAND": "nbands"}
        # pair each axis name with the item count at the same position
        for ax, count in zip(props["axnames"], use_block["CORE_ITEMS"]):
            props[ax_map[ax]] = count
    else:
        # no axis names given: take ncols from position 0 and nrows from
        # position 2 of CORE_ITEMS. 'nbands' is not set on this path.
        props["nrows"] = use_block["CORE_ITEMS"][2]
        props["ncols"] = use_block["CORE_ITEMS"][0]
    props["band_storage_type"] = band_storage_type
    if props["band_storage_type"] is None:
        if props.get("axnames") is not None:
            # noinspection PyTypeChecker
            # writing keys in last-axis-fastest for clarity. however,
            # ISIS always (?) uses first-axis-fastest, hence `reversed` below.
            # an axis order not listed here raises KeyError.
            props["band_storage_type"] = {
                ("BAND", "LINE", "SAMPLE"): "BAND_SEQUENTIAL",
                ("LINE", "SAMPLE", "BAND"): "SAMPLE_INTERLEAVED",
                ("LINE", "BAND", "SAMPLE"): "LINE_INTERLEAVED",
            }[tuple(reversed(props["axnames"]))]
        else:
            props["band_storage_type"] = "ISIS2_QUBE"
    # both helpers receive the props built so far, so they run last
    props |= extract_axplane_metadata(block, props)
    # TODO: unclear whether lower-level linefixes ever appear on qubes
    props |= extract_linefix_metadata(block, props)
    return props  # not type-complete, 'pixels' added in get_image_properties()

get_array_num_items(block: MultiDict) -> int

How many total array elements does an ARRAY definition imply?

Source code in pdr/loaders/queries.py
273
274
275
276
277
278
279
280
def get_array_num_items(block: MultiDict) -> int:
    """
    Return the total number of array elements implied by an ARRAY
    definition's AXIS_ITEMS: the value itself if it is an int, or the product
    of its members if it is a sequence. Raises TypeError for anything else.
    """
    axis_items = block["AXIS_ITEMS"]
    if isinstance(axis_items, int):
        return axis_items
    if not isinstance(axis_items, Sequence):
        raise TypeError("can't interpret this item number specification")
    return reduce(mul, axis_items)

get_block(data: PDRLike, name: str) -> Optional[MultiDict]

query wrapper for pdr.Data.metablock_(). also checks for interleaved objects.

Source code in pdr/loaders/queries.py
310
311
312
313
314
315
316
317
318
319
320
def get_block(data: PDRLike, name: str) -> Optional[MultiDict]:
    """
    Query wrapper for `pdr.Data.metablock_()` that also handles interleaved
    objects: if `name` is registered in `data._interleaved_objects`, its
    parent's block is fetched instead (and passed through
    `_fix_up_line_prefix_table_block()` for 'line_prefix_table' entries).
    """
    if name not in data._interleaved_objects.keys():
        return data.metablock_(name)
    entry = data._interleaved_objects[name]
    parent_block = data.metablock_(entry['parent'])
    if data._interleaved_objects[name]['type'] != 'line_prefix_table':
        return parent_block
    return _fix_up_line_prefix_table_block(data, name, parent_block)

get_debug(data: PDRLike) -> bool

Are we in debug mode?

Source code in pdr/loaders/queries.py
533
534
535
def get_debug(data: PDRLike) -> bool:
    """Query wrapper returning `data.debug` (whether debug mode is on)."""
    debug_flag = data.debug
    return debug_flag

get_file_mapping(data: PDRLike, name: str) -> Union[str, Path, list[Union[str, Path]]]

query wrapper for pdr.Data.file_mapping.__getitem__()

Source code in pdr/loaders/queries.py
323
324
325
326
327
def get_file_mapping(
    data: PDRLike, name: str
) -> Union[str, Path, list[Union[str, Path]]]:
    """
    Query wrapper for `pdr.Data.file_mapping.__getitem__()`: look up `name`
    in `data.file_mapping` and return the associated entry.
    """
    mapping = data.file_mapping
    return mapping[name]

get_histogram_fields(block: MultiDict) -> list[dict]

Simplified version of read_format_block() for HISTOGRAM objects, whose format specifications are much terser than TABLE/SPREADSHEET/ARRAY.

Source code in pdr/loaders/queries.py
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
def get_histogram_fields(block: MultiDict) -> list[dict]:
    """
    Terse counterpart of `read_format_block()` for HISTOGRAM objects, whose
    format specifications are much simpler than TABLE/SPREADSHEET/ARRAY. The
    whole block becomes a single field, repeated via
    `append_repeated_object()` if the block defines ITEMS.
    """
    # This error could go somewhere else, but at least we catch it early here
    if block.get("INTERCHANGE_FORMAT") == "ASCII":
        raise NotImplementedError(
            "ASCII histograms are not currently supported."
        )
    repeats = block.get("ITEMS")
    if repeats is None:
        return [dict(block)]
    return append_repeated_object(dict(block), [], repeats)

get_identifiers(data) -> dict[str, Any]

Query wrapper for pdr.Data.__getattr__("identifiers")

Source code in pdr/loaders/queries.py
878
879
880
def get_identifiers(data) -> dict[str, Any]:
    """Query wrapper returning the `identifiers` attribute of `data`."""
    identifiers = data.identifiers
    return identifiers

get_image_properties(gen_props: ImageProps) -> ImageProps

Second-step cleaning/formatting function for an image properties dict, typically derived from generic_image_properties(), qube_image_properties(), or a special case.

Source code in pdr/loaders/queries.py
185
186
187
188
189
190
191
192
193
194
195
196
197
198
def get_image_properties(gen_props: ImageProps) -> ImageProps:
    """
    Second-step cleaning/formatting function for an image properties dict,
    typically derived from `generic_image_properties()`,
    `qube_image_properties()`, or a special case. Runs
    `check_fix_validity()` on the dict, then adds a 'pixels' entry: the
    product of the padded row, column, and band counts. The input dict is
    modified in place and also returned.
    """
    check_fix_validity(gen_props)
    padded_rows = gen_props["nrows"] + gen_props["rowpad"]
    padded_cols = (
        gen_props["ncols"] + gen_props["colpad"] + gen_props["linepad"]
    )
    padded_bands = gen_props["nbands"] + gen_props["bandpad"]
    gen_props["pixels"] = padded_rows * padded_cols * padded_bands
    return gen_props

get_none() -> None

Don't get anything

Source code in pdr/loaders/queries.py
883
884
885
def get_none() -> None:
    """Placeholder query that fetches nothing; always returns None."""
    return

get_qube_band_storage_type(block: MultiDict) -> Optional[BandStorageType]

Attempt to get band storage type from a QUBE definition.

Source code in pdr/loaders/queries.py
250
251
252
def get_qube_band_storage_type(block: MultiDict) -> Optional[BandStorageType]:
    """
    Look up BAND_STORAGE_TYPE in a QUBE definition, returning None if the
    definition does not specify one.
    """
    storage_type = block.get("BAND_STORAGE_TYPE")
    return storage_type

get_return_default(data: PDRLike, name: str) -> MultiDict

Wrapper for data.metaget_ used to return default values for failed loads in non-debug mode.

Source code in pdr/loaders/queries.py
525
526
527
528
529
530
def get_return_default(data: PDRLike, name: str) -> MultiDict:
    """
    Thin wrapper around `data.metaget_(name)`, used to supply default values
    for failed loads in non-debug mode.
    """
    default = data.metaget_(name)
    return default

get_target(data: PDRLike, name: str) -> PhysicalTarget

Attempt to get the 'target' of a PDS3 pointer or other physical data location marker for name. This typically becomes the target argument of data_start_byte() and/or table_position(). Also redirects for interleaved objects.

Source code in pdr/loaders/queries.py
330
331
332
333
334
335
336
337
338
339
340
341
342
def get_target(data: PDRLike, name: str) -> PhysicalTarget:
    """
    Attempt to get the 'target' of a PDS3 pointer or other physical data
    location marker for `name`. This typically becomes the `target` argument
    of `data_start_byte()` and/or `table_position()`. Interleaved objects are
    redirected to their parent. If the plain metadata lookup yields a
    mapping or nothing at all, the pointerized name is looked up instead.
    """
    interleaved = data._interleaved_objects
    lookup_name = name
    if name in interleaved.keys():
        lookup_name = interleaved[name]['parent']
    target = data.metaget_(lookup_name)
    if target is None or isinstance(target, Mapping):
        return data.metaget_(pointerize(lookup_name))
    return target

gt0f(seq: Collection[Number]) -> tuple[Number]

greater-than-0 filter

Source code in pdr/loaders/queries.py
140
141
142
def gt0f(seq: Collection[Number]) -> tuple[Number]:
    """Greater-than-0 filter: the members of `seq` above zero, as a tuple."""
    return tuple(value for value in seq if value > 0)

im_sample_type(base_samp_info: dict) -> str

Determine appropriate numpy dtype string for an IMAGE object

Source code in pdr/loaders/queries.py
201
202
203
204
205
206
207
208
def im_sample_type(base_samp_info: dict) -> str:
    """
    Determine appropriate numpy dtype string for an IMAGE object from its
    SAMPLE_TYPE and BYTES_PER_PIXEL. Returns None when SAMPLE_TYPE is the
    empty string.
    """
    if base_samp_info["SAMPLE_TYPE"] == "":
        # nothing to look up
        return None
    return sample_types(
        base_samp_info["SAMPLE_TYPE"],
        base_samp_info["BYTES_PER_PIXEL"],
        for_numpy=True,
    )

inject_format_files(block: list[tuple[str, Any]], name: str, fn: str, data: PDRLike) -> list[tuple[str, Any]]

Load format files referenced by a TABLE/SPREADSHEET/CONTAINER/COLLECTION definition (or recursively referenced by a referenced format file), parse them, and insert them into the referencing definition.

Source code in pdr/loaders/queries.py
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
def inject_format_files(
    block: list[tuple[str, Any]],
    name: str,
    fn: str,
    data: PDRLike
) -> list[tuple[str, Any]]:
    """
    Replace every structure-pointer entry of `block` (those whose key matches
    STRUCTUREPAT) with the parsed contents of the format file it references,
    preserving the position of everything else. Used for
    TABLE/SPREADSHEET/CONTAINER/COLLECTION definitions; callers re-invoke it
    while pointers remain in order to handle format files that reference
    other format files.
    """
    # keys are not unique, so pointers are tracked by position, not by key
    pointers = [
        (position, entry[1])
        for position, entry in enumerate(block)
        if STRUCTUREPAT.match(entry[0])
    ]
    merged = []
    cursor = 0
    for position, format_fn in pointers:
        loaded = list(load_format_file(data, format_fn, name, fn).items())
        # if the block is itself a TABLE, assume that it's intended to unpack
        # into a parent object, like a LINE_PREFIX_TABLE on an image
        if len(loaded) == 1 and "TABLE" in loaded[0][0]:
            loaded = list(loaded[0][1].items())
        # everything since the previous pointer, then the loaded structure
        merged += block[cursor:position] + loaded
        cursor = position + 1
    merged += block[cursor:]
    return merged

load_format_file(data: PDRLike, format_file: str, name: str, fn: str) -> MultiDict

Attempt to find and read a PVL format file (usually referenced by ^STRUCTURE pointers in an object definition). Normal PVL-reading workflows (including just pdr.read()) work fine on these files, but this function includes additional code to attempt to find the format file.

Source code in pdr/loaders/queries.py
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
def load_format_file(
    data: PDRLike,
    format_file: str,
    name: str,
    fn: str
) -> MultiDict:
    """
    Attempt to find and read a PVL format file (usually referenced by
    ^STRUCTURE pointers in an object definition). Normal PVL-reading workflows
    (including just `pdr.read()`) work fine on these files, but this function
    includes additional code to attempt to _find_ the format file.

    Candidate locations are the paths returned by
    `data.get_absolute_paths(format_file)` plus, when a repository root can
    be determined from `fn`, that root's "label" and "LABEL" directories.

    Emits a warning and raises FileNotFoundError (carrying the same message,
    chained to the original error) if no candidate can be read.
    """
    label_fns = data.get_absolute_paths(format_file)
    try:
        repo_paths = [
            Path(find_repository_root(Path(fn)), label_path)
            for label_path in ("label", "LABEL")
        ]
        label_fns += [Path(path, format_file) for path in repo_paths]
    except (ValueError, IndexError):
        # deliberate best-effort: presumably the repository root could not
        # be determined, so only the original candidates are tried
        pass
    try:
        return read_pvl(check_cases(label_fns))[0]
    except FileNotFoundError as err:
        message = (
            f"Unable to locate external table format file:\n\t {format_file}. "
            f"Try retrieving this file and placing it in the same path as the "
            f"{name} file."
        )
        warnings.warn(message)
        # raise with the message instead of a bare, message-less exception,
        # and keep the underlying lookup failure as the explicit cause
        raise FileNotFoundError(message) from err

parse_array_structure(name: str, block: MultiDict, fn: str, data: PDRLike, identifiers: DataIdentifiers) -> tuple[Optional[pd.DataFrame], Optional[Union[str, np.dtype]]]

parse_table_structure() modified for the special needs of ARRAYs.

Source code in pdr/loaders/queries.py
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
def parse_array_structure(
    name: str,
    block: MultiDict,
    fn: str,
    data: PDRLike,
    identifiers: DataIdentifiers
) -> tuple[Optional[pd.DataFrame], Optional[Union[str, np.dtype]]]:
    """
    `parse_table_structure()` modified for the special needs of ARRAYs.

    Returns `(None, None)` if the ARRAY's INTERCHANGE_FORMAT is not BINARY,
    and `(None, dtype)` if the ARRAY has no subobjects (per
    `check_array_for_subobject()`). Otherwise returns the result of
    `insert_sample_types_into_df()` applied to the ARRAY's format definition.
    """
    if block.get("INTERCHANGE_FORMAT") != "BINARY":
        return None, None
    if not check_array_for_subobject(block):
        return None, sample_types(block["DATA_TYPE"], block["BYTES"], True)
    fmtdef = read_table_structure(block, name, fn, data, identifiers)
    # Sometimes ARRAYS give START_BYTE at top level; sometimes only their
    # elements do. We want to defer responsibility for figuring this out to
    # compute_offsets(). This satisfies its type expectations.
    if "START_BYTE" in fmtdef.columns:
        # assign the filled column back explicitly: a chained
        # `fmtdef[col].fillna(..., inplace=True)` acts on a temporary and is
        # not written back to the frame under pandas Copy-on-Write
        fmtdef["START_BYTE"] = fmtdef["START_BYTE"].fillna(1)

    from pdr.pd_utils import insert_sample_types_into_df
    return insert_sample_types_into_df(fmtdef, identifiers)

parse_table_structure(name: str, block: MultiDict, fn: str, data: PDRLike, identifiers: DataIdentifiers) -> tuple[pd.DataFrame, Optional[np.dtype]]

Parse a TABLE or SPREADSHEET's format specification as a pd.DataFrame (see read_table_structure()). If that specification contains byte-position information for columns, further parse them into explicit offsets. If the table is binary, also create a numpy dtype object (usually a compound dtype). These typically become inputs for np.fromfile (for binary tables) or for one of several ASCII parsers.

Source code in pdr/loaders/queries.py
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
def parse_table_structure(
    name: str,
    block: MultiDict,
    fn: str,
    data: PDRLike,
    identifiers: DataIdentifiers
) -> tuple[pd.DataFrame, Optional[np.dtype]]:
    """
    Parse a TABLE or SPREADSHEET's format specification as a pd.DataFrame
    (see `read_table_structure()`). If that specification contains
    byte-position information for columns, further parse them into explicit
    offsets. If the table is binary, also create a numpy dtype object
    (usually a compound dtype). These typically become inputs for np.fromfile
    (for binary tables) or for one of several ASCII parsers.

    Returns `(fmtdef, None)` for tables judged to be ASCII. Raises ValueError
    if missing byte sizes cannot be filled in.
    """
    fmtdef = read_table_structure(block, name, fn, data, identifiers)
    lacks_bytes = (
        "DATA_TYPE" in fmtdef.columns and "BYTES" not in fmtdef.columns
    )
    if lacks_bytes:
        if _probably_ascii(block, fmtdef, name):
            # this is either a nonstandard fixed-width table or a DSV table.
            # don't bother trying to calculate explicit byte offsets.
            return fmtdef, None
        fmtdef["BYTES"] = float('nan')
    if fmtdef['BYTES'].isna().any():
        try:
            fmtdef = _fill_empty_byte_rows(fmtdef)
        except (KeyError, TypeError, IndexError):
            raise ValueError("This table's byte sizes are underspecified.")
    # copy any row-level byte counts from the block onto the format definition
    for suffix in ("_PREFIX", "_SUFFIX", ""):
        key = f"ROW{suffix}_BYTES"
        if (row_bytes := block.get(key)) is not None:
            fmtdef[key] = row_bytes
    from pdr.pd_utils import compute_offsets, insert_sample_types_into_df
    if "START_BYTE" in fmtdef.columns:
        fmtdef = compute_offsets(fmtdef)
    if not _probably_ascii(block, fmtdef, name):
        return insert_sample_types_into_df(fmtdef, identifiers)
    # don't attempt to compute numpy dtypes for ASCII tables
    return fmtdef, None

read_format_block(block: MultiDict, object_name: str, fn: str, data: PDRLike, identifiers: DataIdentifiers, within_container: bool = False) -> tuple[list[dict], bool]

Parse a TABLE, ARRAY, SPREADSHEET, CONTAINER, or COLLECTION definition, recursing into ARRAY, CONTAINER, or COLLECTION subcomponents of that definition and loading external STRUCTURE specifications as needed.

This function's fields return value becomes the rows of the fmtdef object used extensively in the table/array-reading workflow.

Source code in pdr/loaders/queries.py
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
def read_format_block(
    block: MultiDict,
    object_name: str,
    fn: str,
    data: PDRLike,
    identifiers: DataIdentifiers,
    within_container: bool = False
) -> tuple[list[dict], bool]:
    """
    Parse a TABLE, ARRAY, SPREADSHEET, CONTAINER, or COLLECTION definition,
    recursing into ARRAY, CONTAINER, or COLLECTION subcomponents of that
    definition and loading external STRUCTURE specifications as needed.

    This function's `fields` return value becomes the rows of the `fmtdef`
    object used extensively in the table/array-reading workflow.

    The second return value, `needs_placeholder`, is True if the definition
    contained a CONTAINER, COLLECTION, or subobject-bearing ARRAY that was
    encountered either with `within_container` False or before any fields had
    been collected. When this function recurses, the caller uses it to decide
    whether to insert a PLACEHOLDER pseudo-field before the returned fields.

    Raises NotImplementedError for BIT_ELEMENTs and for structured-data
    parameters with no defined handling.
    """
    # load external structure specifications
    format_block = list(block.items())
    # propagate top-level NAME to set offsets correctly for a variety of
    # nesting objects; propagate top-level REPETITIONS and BYTES to set byte
    # offsets correctly in repeating CONTAINERs
    block_info = {
        f"BLOCK_NAME": block.get("NAME"),
        f"BLOCK_REPETITIONS": block.get("REPETITIONS", 1),
        f"BLOCK_BYTES": block.get("BYTES")
    }
    # keep injecting until no structure pointers remain, since an injected
    # format file may itself contain structure pointers
    while any(STRUCTUREPAT.match(obj[0]) for obj in format_block):
        format_block = inject_format_files(format_block, object_name, fn, data)
    # NOTE(review): `add_placeholder` and `reps` are initialized once, here,
    #  and are not reset at the top of each loop iteration. The
    #  COLUMN/FIELD/ELEMENT/PRIMITIVE_ARRAY path never assigns
    #  `add_placeholder`, and the subobject-bearing ARRAY path never assigns
    #  `reps`, so each keeps whatever value an earlier item left behind.
    #  Confirm that this carry-over is intended.
    fields, needs_placeholder, add_placeholder, reps = [], False, False, None
    for item_type, definition in format_block:
        if item_type not in PDS3_STRUCTURED_DATA_PARAMETERS:
            # things formally unrelated to data structure (e.g. physical units)
            continue
        if item_type == "ARRAY":
            # ARRAYs without subobjects are handled like simple fields
            if not check_array_for_subobject(definition):
                item_type = "PRIMITIVE_ARRAY"
        if item_type in ("COLUMN", "FIELD", "ELEMENT", "PRIMITIVE_ARRAY"):
            # TODO: this STRUCTUREPAT.match... block smells incredibly bad. Why
            #  is it guarded by the COLUMN/FIELD/ELEMENT/PRIMITIVE_ARRAY
            #  conditional? Why are we scrupulously calling MultiDict.add()
            #  and then immediately casting `definition` back to `dict`,
            #  discarding any duplicate keys we took such care to retain? What
            #  nightmarish class of cases does this catch?
            if "^STRUCTURE" in definition:
                definition_l = inject_format_files(
                    list(definition.items()), object_name, fn, data
                )
                definition = MultiDict()
                for key, val in definition_l:
                    definition.add(key, val)
            # on key collisions, the block-level info overrides the definition
            obj = dict(definition) | block_info
            # TODO: also smells very bad. Why is this inside a branch that
            #  matches many things that are not ARRAYS (but does not match
            #  every ARRAY)?
            if "BIT_ELEMENT" in obj.keys():
                raise NotImplementedError(
                    "BIT_ELEMENTS in ARRAYS not yet supported"
                )
            reps = definition.get("ITEMS")
            obj = add_bit_column_info(obj, definition, identifiers)
        elif item_type in ("CONTAINER", "COLLECTION", "ARRAY"):
            # This is a somewhat convoluted way to tell the caller to place a
            # PLACEHOLDER pseudo-field between a running sequence of fields
            # and the fields this function returns.
            if within_container is False or len(fields) == 0:
                needs_placeholder = True
            # NOTE(review): `within_container` is not passed here, so the
            #  recursive call always runs with its default (False) -- confirm
            #  this is intended.
            obj, add_placeholder = read_format_block(
                definition, object_name, fn, data, identifiers
            )
            if item_type == "ARRAY":
                # subobject-bearing ARRAYs always get a placeholder
                add_placeholder = True
            else:
                reps = definition.get("REPETITIONS")
        else:
            # this suggests we made a typo
            raise NotImplementedError(f"No defined behavior for {item_type}.")
        # Format-level data structures like CONTAINERs should not be exposed
        # to the parser; we only care about the things they contain. However,
        # because START_BYTE is always expressed relative to the parent data
        # structure in PDS3 table format specifications, we still need to
        # account for cases in which these structures are separated from
        # preceding elements by pad bytes / whitespace. These PLACEHOLDER
        # pseudo-fields inform downstream code about spacing while also
        # signaling that bytes / characters within that interval should be
        # ignored by the parser.
        if add_placeholder is True:
            fields.append(create_placeholder_field(block_info, definition))
        # CONTAINERs can have REPETITIONS, and some so-called COLUMNS contain
        # a lot of "columns" (ITEMS) as the term is generally used. We express
        # these cases by simply repeating the definition, renaming duplicates.
        if reps is not None:
            fields = append_repeated_object(obj, fields, reps)
        elif isinstance(obj, list):
            # a recursive call returned a list of fields; splice it in flat
            fields.extend(obj)
        else:
            fields.append(obj)
    # semi-legal top-level containers not wrapped in other objects
    if object_name == "CONTAINER":
        if (repeat_count := block.get("REPETITIONS")) is not None:
            # repeat the whole field list `repeat_count` times, flattened
            fields = list(chain(*[fields for _ in range(repeat_count)]))
    return fields, needs_placeholder

read_table_structure(block: MultiDict, name: str, fn: str, data: PDRLike, identifiers: DataIdentifiers) -> pd.DataFrame

Try to turn a TABLE/SPREADSHEET/ARRAY/HISTOGRAM definition into a format definition DataFrame whose rows represent the columns of the defined object and whose columns represent various properties of those columns (data type, byte offset, etc.). Due to the complexity of the PDS3 Standards for these objects, this can include a wide variety of behaviors, including recursively unpacking subobjects, loading external format files, and adding "placeholder" entries for 'padding' (e.g. extra whitespace, separator characters, and row prefixes/suffixes). This is most often called by parse_table_structure() or parse_array_structure(), but some special cases use it on its own.

Source code in pdr/loaders/queries.py
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
def read_table_structure(
    block: MultiDict,
    name: str,
    fn: str,
    data: PDRLike,
    identifiers: DataIdentifiers
) -> pd.DataFrame:
    """
    Try to turn a TABLE/SPREADSHEET/ARRAY/HISTOGRAM definition into a
    format definition DataFrame whose rows represent the columns of the
    defined object and whose columns represent various properties of those
    columns (data type, byte offset, etc.). Due to the complexity of the PDS3
    Standards for these objects, this can include a wide variety of behaviors,
    including recursively unpacking subobjects, loading external format files,
    and adding "placeholder" entries for 'padding' (e.g. extra whitespace,
    separator characters, and row prefixes/suffixes). This is most often
    called by `parse_table_structure()` or `parse_array_structure()`, but some
    special cases use it on its own.
    """
    if "HISTOGRAM" not in name:
        fields, _ = read_format_block(block, name, fn, data, identifiers)
    else:
        # HISTOGRAM definitions get their own, much simpler parser
        fields = get_histogram_fields(block)
    import pandas as pd
    from pdr.pd_utils import reindex_df_values

    fmtdef = pd.DataFrame.from_records(fields)
    if "NAME" not in fmtdef.columns:
        # no field supplied a NAME; fall back to the object's own name
        fmtdef["NAME"] = name
    # give columns unique names so that none of our table handling explodes
    unique_fmtdef = reindex_df_values(fmtdef)
    return unique_fmtdef

table_position(identifiers: DataIdentifiers, block: MultiDict, target: PhysicalTarget, name: str, start_byte: int) -> dict[str, Union[bool, int, None]]

Determine the starting position of a TABLE/SPREADSHEET object from its definition and other previously-determined information.

In the returned dict, if as_rows is True, the table is a delimiter-separated ASCII table with no explicitly-defined row length, and both "start" and "length" should be interpreted as rows; otherwise, both "start" and "length" should be interpreted as bytes. If length is None, the table occupies the entirety of the file including and after "start".

Source code in pdr/loaders/queries.py
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
def table_position(
    identifiers: DataIdentifiers,
    block: MultiDict,
    target: PhysicalTarget,
    name: str,
    start_byte: int
) -> dict[str, Union[bool, int, None]]:
    """
    Determine the starting position of a TABLE/SPREADSHEET object from its
    definition and other previously-determined information.

    In the returned `dict`, if as_rows is True, the table is a
    delimiter-separated ASCII table with no explicitly-defined row length,
    and both "start" and "length" should be interpreted as rows; otherwise,
    both "start" and "length" should be interpreted as bytes. If length is
    None, the table occupies the entirety of the file including and after
    "start".

    Raises ValueError for a negative `start_byte` and NotImplementedError for
    a HEADER object whose length cannot be determined.
    """
    if start_byte < 0:
        raise ValueError(f"bad start byte {start_byte}")
    try:
        n_records = _extract_table_records(block)
    except AttributeError:
        n_records = None
    as_rows = _check_delimiter_stream(identifiers, name, target, block)
    if not as_rows:
        # position expressed in bytes
        start = start_byte
        length = _table_length(block, identifiers, n_records)
    else:
        # position expressed in rows
        length, start = _table_row_position(n_records, target)
    if length in (None, "UNK") and "HEADER" in name:
        raise NotImplementedError("header with unknown length")
    position = {"start": start, "length": length, "as_rows": as_rows}
    return position

loaders.table

Functions for the nitty-gritty byte-juggling parts of TABLE/SPREADSHEET/ARRAY/HISTOGRAM loading.

PAD_CHARACTERS = ' \t",' module-attribute

Characters we want to strip from the beginning/end of every element of an ASCII table.

_interpret_as_ascii(fn: str, fmtdef: pd.DataFrame, block: MultiDict, table_props: dict)

Load text from a file and parse it as an ASCII table.

Source code in pdr/loaders/table.py
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
def _interpret_as_ascii(
    fn: str,
    fmtdef: pd.DataFrame,
    block: MultiDict,
    table_props: dict
):
    """
    Load text from a file and parse it as an ASCII table.

    `table_props` supplies "start", "length", and "as_rows". When "as_rows"
    is False, "start" and "length" are used as a byte offset and byte count;
    otherwise they are used as a number of lines to skip and a number of
    lines to read (all remaining lines if "length" is None or "").
    """
    with decompress(fn) as f:
        if table_props["as_rows"] is not False:
            # row-oriented: skip leading lines, then collect the table's lines
            if table_props["start"] > 0:
                for _ in range(table_props["start"]):
                    next(f)
            if table_props["length"] in (None, ""):
                lines = f.readlines()
            else:
                lines = [next(f) for _ in range(table_props["length"])]
            decoded = (bytes.decode(line) for line in lines)
            stringbuf = StringIO("\r\n".join(decoded))
        else:
            # byte-oriented: read exactly the table's byte range
            bytesbuf = head_file(
                f, nbytes=table_props["length"], offset=table_props["start"]
            )
            try:
                stringbuf = StringIO(bytesbuf.read().decode())
            finally:
                bytesbuf.close()
    stringbuf.seek(0)
    return _read_table_from_stringio(fmtdef, block, stringbuf)

_interpret_as_binary(fn, fmtdef, dt, block, start_byte)

Source code in pdr/loaders/table.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def _interpret_as_binary(fn, fmtdef, dt, block, start_byte):
    """
    Read a binary table from `fn` with dtype `dt`, starting at `start_byte`,
    and return it as a DataFrame after applying the conversion steps below
    and naming its columns from `fmtdef.NAME`. The number of records read is
    `block`'s ROWS value, or 1 if ROWS is absent.
    """
    # TODO: this works poorly (from a usability and performance
    #  perspective; it's perfectly stable) for tables defined as
    #  a single row with tens or hundreds of thousands of columns
    n_rows = block.get("ROWS")
    # TODO: what is this a fallback for? it could produce incorrect
    #  behavior in some cases
    if n_rows is None:
        n_rows = 1
    with decompress(fn) as f:
        records = np_from_buffered_io(
            f, dtype=dt, offset=start_byte, count=n_rows
        )
    frame = pd.DataFrame(enforce_order_and_object(records))
    frame = convert_ibm_reals(frame, fmtdef)
    frame = convert_vax_reals(frame, fmtdef)
    # columns are named only after the real-number conversions above
    frame.columns = fmtdef.NAME.tolist()
    frame = convert_ebcdic(frame, fmtdef)
    frame = booleanize_booleans(frame, fmtdef)
    return bit_handling.expand_bit_strings(frame, fmtdef)

_read_as_delimited(sep: str, string_buffer: StringIO, fmtdef: pd.DataFrame) -> Optional[pd.DataFrame]

Attempt to read an ASCII table as a delimiter-separated file. We always try this first before moving to a fixed-width parser.

Source code in pdr/loaders/table.py
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
def _read_as_delimited(
    sep: str,
    string_buffer: StringIO,
    fmtdef: pd.DataFrame
) -> Optional[pd.DataFrame]:
    """
    Try to parse an ASCII table as delimiter-separated text. This is always
    attempted before falling back to a fixed-width parser.

    Raises IndexError if the number of parsed columns, plus the number of
    'PLACEHOLDER' entries in `fmtdef`, differs from the number of names in
    `fmtdef`. String-valued columns are stripped of `PAD_CHARACTERS`.
    """
    parsed = pd.read_csv(string_buffer, sep=sep, header=None)
    # TODO: adding this 'PLACEHOLDER' check has allowed many tables to use
    #  read_csv() instead of read_fwf(), which is generally preferable
    #  because read_fwf() is very slow. This may also be able to invalidate
    #  some special cases; should check.
    placeholder_mask = fmtdef.NAME.str.contains('PLACEHOLDER')
    n_placeholders = len(fmtdef.loc[placeholder_mask])
    n_expected = len(fmtdef.NAME.tolist())
    if len(parsed.columns) + n_placeholders != n_expected:
        raise IndexError("Mismatched column length.")
    # string data is "object" in pandas < 3 and "str" in pandas >= 3
    string_dtype_names = ("object", 'str')
    for column, dtype in zip(parsed.columns, parsed.dtypes):
        if dtype.name not in string_dtype_names:
            continue
        parsed[column] = parsed[column].str.strip(PAD_CHARACTERS)
    return parsed

_read_fwf_with_colspecs(fmtdef: pd.DataFrame, string_buffer: StringIO) -> pd.DataFrame

Attempt to read an ASCII table as a fixed-width file using column boundaries specified by or inferred from its format definition.

Source code in pdr/loaders/table.py
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
def _read_fwf_with_colspecs(
    fmtdef: pd.DataFrame, string_buffer: StringIO
) -> pd.DataFrame:
    """
    Try to parse an ASCII table as fixed-width text, taking each column's
    boundaries from the format definition: start at SB_OFFSET, width of
    ITEM_BYTES when present and not NaN, otherwise BYTES.
    """
    # TODO: this if clause is a 'general special' statement, intended to handle
    #  instances in which special cases call read_table_structure() but do not
    #  pass its results to parse_table_structure() due to some special required
    #  handling. We probably want to change something upstream to avoid this.
    if "SB_OFFSET" in fmtdef.columns:
        position_records = fmtdef.to_dict("records")
    else:
        position_records = compute_offsets(fmtdef).to_dict("records")
    colspecs = []
    for record in position_records:
        item_bytes = record.get("ITEM_BYTES", np.nan)
        width = record["BYTES"] if np.isnan(item_bytes) else int(item_bytes)
        begin = record["SB_OFFSET"]
        colspecs.append((begin, begin + width))
    # NOTE: the 'delimiter' argument to read_fwf() does _not_ specify
    # an actual delimiter. It defines characters the read_fwf parser
    # will treat as 'padding' and strip from each table element.
    return pd.read_fwf(
        string_buffer,
        header=None,
        colspecs=colspecs,
        delimiter=PAD_CHARACTERS
    )

_read_table_from_stringio(fmtdef: pd.DataFrame, block: MultiDict, string_buffer: StringIO) -> pd.DataFrame

Attempt to parse a string buffer, presumably containing an ASCII table, as a pandas DataFrame. First try to treat it as a delimiter-separated table; fall back to fixed-width parsing if that doesn't work.

Source code in pdr/loaders/table.py
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
def _read_table_from_stringio(
    fmtdef: pd.DataFrame,
    block: MultiDict,
    string_buffer: StringIO
) -> pd.DataFrame:
    """
    Parse a string buffer (presumably holding an ASCII table) into a pandas
    DataFrame. Parsers are tried in order, rewinding the buffer after each
    failure:

    1. delimiter-separated, using the delimiter reported for `block`;
    2. fixed-width with explicit column specs, if `fmtdef` has a BYTES
       column;
    3. fixed-width with no column specs.
    """
    # TODO, maybe: add better delimiter detection & dispatch
    try:
        return _read_as_delimited(
            check_explicit_delimiter(block), string_buffer, fmtdef
        )
    except (IndexError, UnicodeError, AttributeError, ParserError):
        string_buffer.seek(0)
    has_byte_widths = "BYTES" in fmtdef.columns
    if has_byte_widths:
        try:
            return _read_fwf_with_colspecs(fmtdef, string_buffer)
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            string_buffer.seek(0)
    # Last resort, reached when there are no column specifications or when
    # using them failed. This usually won't work!
    # NOTE: 'delimiter' here names padding characters for read_fwf() to
    # strip from each element; it is not an actual field delimiter.
    return pd.read_fwf(string_buffer, header=None, delimiter=PAD_CHARACTERS)

read_array(fn, block, start_byte, fmtdef_dt)

Read an array object from this product and return it as a numpy array.

Source code in pdr/loaders/table.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def read_array(fn, block, start_byte, fmtdef_dt):
    """
    Read an array object from this product and return it as a numpy array
    shaped according to `block["AXIS_ITEMS"]`.

    Blocks whose INTERCHANGE_FORMAT is "BINARY" are read with the dtype in
    `fmtdef_dt`, starting at `start_byte`. Every other block is treated as
    ASCII: the whole file is read as text and its numbers are extracted.
    """
    is_binary = block.get("INTERCHANGE_FORMAT") == "BINARY"
    if not is_binary:
        # objects without the optional interchange_format key are assumed
        # to be ascii
        with open(fn) as stream:
            raw = stream.read()
        try:
            values = tuple(
                float(token) for token in re.findall(r"[+-]?\d+\.?\d*", raw)
            )
        except (TypeError, IndexError, ValueError):
            values = re.split(r"\s+", raw)
        ascii_array = np.asarray(values).reshape(block["AXIS_ITEMS"])
        if "DATA_TYPE" not in block.keys():
            return ascii_array
        return ascii_array.astype(
            sample_types(block["DATA_TYPE"], block["BYTES"], True)
        )
    _, dt = fmtdef_dt
    n_items = get_array_num_items(block)
    with decompress(fn) as f:
        binary_array = np_from_buffered_io(
            f,
            dtype=dt,
            count=n_items,
            offset=start_byte,
        )
    return binary_array.reshape(block["AXIS_ITEMS"])

read_table(identifiers, fn, fmtdef_dt, table_props, block, start_byte)

Read a table. Parse the label format definition and then decide whether to treat the table as text or binary.

Source code in pdr/loaders/table.py
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def read_table(
    identifiers,
    fn,
    fmtdef_dt,
    table_props,
    block,
    start_byte,
):
    """
    Read a table as a pandas DataFrame.

    `fmtdef_dt` is a (format definition, dtype) pair. A dtype of None means
    the table is treated as ASCII; otherwise it is read as binary. After
    loading, placeholder columns are dropped and any per-column
    SCALING_FACTOR / OFFSET values from the format definition are applied
    (multiply first, then add).
    """
    fmtdef, dt = fmtdef_dt
    if dt is not None:
        table = _interpret_as_binary(fn, fmtdef, dt, block, start_byte)
    else:
        # no dtype: we believe the object is an ascii file
        table = _interpret_as_ascii(fn, fmtdef, block, table_props)
        if len(table.columns) == len(fmtdef):
            table.columns = fmtdef['NAME']
        else:
            table.columns = [
                name
                for name in fmtdef['NAME']
                if not name.startswith('PLACEHOLDER')
            ]
    table = _drop_placeholders(table)
    # Nothing more to do unless the format definition carries an offset
    # and/or scaling factor column.
    if fmtdef.get("OFFSET") is None and fmtdef.get("SCALING_FACTOR") is None:
        return table
    for col in table.columns:
        record = fmtdef.loc[fmtdef['NAME'] == col].to_dict("records")[0]
        factor = record.get("SCALING_FACTOR")
        if factor and not pd.isnull(factor):
            table[col] = table[col].mul(factor)
        shift = record.get("OFFSET")
        if shift and not pd.isnull(shift):
            table[col] = table[col] + shift
    return table

loaders.text

Pointy-end functions for text-handling Loader subclasses.

ignore_if_pdf(fn: Union[str, Path]) -> Optional[str]

Read text from a file if it's not a pdf.

Source code in pdr/loaders/text.py
131
132
133
134
135
136
137
138
139
@canonicalized
# TODO: misleading name. Primarily a file _reader_.
def ignore_if_pdf(fn: Union[str, Path]) -> Optional[str]:
    """
    Read and return the text of a file, unless it looks like a PDF.

    If `fn` has a ".pdf" extension, emit a warning and return None.
    Otherwise open the case-corrected path in text mode and return its
    entire contents.
    """
    if looks_like_this_kind_of_file(fn, [".pdf"]):
        warnings.warn(f"Cannot open {fn}; PDF files are not supported.")
        return None
    # Use a context manager so the handle is closed as soon as the read
    # finishes (or fails), rather than whenever it is garbage-collected.
    with open(check_cases(fn)) as stream:
        return stream.read()

read_header(fn: Union[str, Path], table_props: dict, name: str = 'HEADER') -> str

Read a text header from a file.

Source code in pdr/loaders/text.py
30
31
32
33
34
35
36
def read_header(
    fn: Union[str, Path],
    table_props: dict,
    name: str = "HEADER"
) -> str:
    """
    Read a text header from a file by delegating to
    `skeptically_load_header` with the same arguments.
    """
    header_text = skeptically_load_header(fn, table_props, name)
    return header_text

read_label(fn: Union[str, Path], fmt: Optional[str] = 'text') -> Union[str, 'PVLModule']

Read the entirety of a PDS3 label, optionally using pvl to parse it as completely as possible into Python objects. This is not intended for use in the primary pdr.Metadata initialization workflow, but rather to handle cases when the user explicitly requests the entirety of the label (typically by accessing the "LABEL" key of a pdr.Data object).

Source code in pdr/loaders/text.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
@canonicalized
def read_label(
    fn: Union[str, Path],
    fmt: Optional[str] = "text"
) -> Union[str, "PVLModule"]:
    """
    Read an entire PDS3 label.

    With `fmt="text"` (the default), return the trimmed label text. With
    `fmt="pvl"`, parse the file with the `pvl` package and return the
    resulting object. Any other `fmt` raises NotImplementedError.

    This is not part of the primary `pdr.Metadata` initialization workflow;
    it serves cases where the user explicitly asks for the whole label
    (typically by accessing the "LABEL" key of a `pdr.Data` object).
    """
    if fmt not in ("text", "pvl"):
        raise NotImplementedError(f"The {fmt} format is not yet implemented.")
    if fmt == "pvl":
        # imported lazily so that pvl is only needed when requested
        import pvl

        return pvl.load(fn)
    return trim_label(decompress(fn))

read_text(target: str, fn: Union[list[str], str]) -> Union[list[str], str]

Read text from a file or list of files.

Source code in pdr/loaders/text.py
15
16
17
18
19
20
21
22
23
24
25
26
27
def read_text(target: str, fn: Union[list[str], str]) -> Union[list[str], str]:
    """
    Read text from a file or list of files.

    `fn` may be a single path string or a list of path strings; the return
    value is correspondingly one result or a list of results from
    `ignore_if_pdf`. If `fn` is neither a str nor a list, nothing is read
    and None is returned implicitly.

    If a file is missing or cannot be decoded, a warning naming `target` is
    emitted and the original exception is re-raised.
    """
    try:
        if isinstance(fn, str):
            return ignore_if_pdf(check_cases(fn))
        elif isinstance(fn, list):
            return [
                ignore_if_pdf(check_cases(each_file))
                for each_file in fn
            ]
    # A tuple is required to catch both types: the expression
    # `FileNotFoundError or UnicodeDecodeError` evaluates to just
    # `FileNotFoundError`, so decode errors used to bypass this handler.
    except (FileNotFoundError, UnicodeDecodeError):
        warnings.warn(f"couldn't find {target}")
        raise

skeptically_load_header(fn: Union[Path, str], table_props: dict, name: str = 'header', fmt: Optional[str] = 'text') -> Union[str, 'PVLModule', None]

Attempt to read a text HEADER object from a file. PDS3 does not give a strict definition of the HEADER object, so there is no way to consistently load HEADERs in a coherent, well-formatted fashion. However, providers generally use HEADER to denote either attached file/product-level metadata, column headers for an ASCII table, or object-level contextualizing metadata for ASCII tables.

By default, simply read the designated byte range as unicode text. If fmt is "pvl", also attempt to parse this text as PVL. (This will fail on most products, because most HEADER objects are not PVL, but is useful for some ancillary attached labels, especially ISIS labels.)

NOTE: HEADERs defined in labels very often do not actually exist and are never essential for loading primary data objects, so this function is always "optional", even in debug mode. If it fails, it will simply raise a UserWarning and return None.

WARNING: this function is not intended to load metadata of standard file formats (such as TIFF tags or FITS headers). These headers should always be handled by a format-specific parser. More generally, it will never work on binary files.

Source code in pdr/loaders/text.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
@canonicalized
def skeptically_load_header(
    fn: Union[Path, str],
    table_props: dict,
    name: str = "header",  # TODO: what's with this default value?
    fmt: Optional[str] = "text",
) -> Union[str, "PVLModule", None]:
    """
    Attempt to read a text HEADER object from a file. PDS3 does not give a
    strict definition of the HEADER object, so there is no way to
    _consistently_ load HEADERs in a coherent, well-formatted fashion. However,
    providers generally use HEADER to denote either attached file/product-level
    metadata, column headers for an ASCII table, or object-level
    contextualizing metadata for ASCII tables.

    By default, simply read the designated byte range as unicode text. If
    `fmt` is "pvl", also attempt to parse this text as PVL. (This will fail
    on most products, because most HEADER objects are not PVL, but is useful
    for some ancillary attached labels, especially ISIS labels.)

    NOTE: HEADERs defined in labels very often do not actually exist and are
    never essential for loading primary data objects, so this function is
    _always_ "optional", even in debug mode. If it fails, it will simply raise
    a UserWarning and return None.

    WARNING: this function is not intended to load metadata of standard file
    formats (such as TIFF tags or FITS headers). These headers should always
    be handled by a format-specific parser. More generally, it will never work
    on binary files.

    Parameters:
        fn: path of the file containing the header.
        table_props: mapping from which the keys "as_rows", "start", and
            "length" are read. When "as_rows" is True, "start" and "length"
            are counted in lines; otherwise they are used as a byte offset
            and a byte count.
        name: used only in the warning message emitted on failure.
        fmt: "pvl" to try PVL parsing first; any other value goes straight
            to the plain-text path.

    Returns:
        The header text, the object returned by `cached_pvl_load` when PVL
        parsing succeeds, or None if a ValueError or OSError occurred (in
        which case a warning is emitted).
    """
    # TODO: all these check_cases calls are probably unnecessary w/new file
    #  mapping workflow
    # FIXME: PVL mode ignores the table_props
    # FIXME: Character encoding should be controlled separately from as_rows
    try:
        if fmt == "pvl":
            try:
                from pdr.pvl_utils import cached_pvl_load

                return cached_pvl_load(decompress(check_cases(fn)))
            except ValueError:
                # PVL parsing failed; fall through to the plain-text paths
                pass
        if table_props["as_rows"] is True:
            # In order to take advantage of Python's universal newline
            # handling, we need to decode the file and _then_ split it.
            # Tolerate encoding errors mainly because we might have a
            # textual header preceded or followed by binary data, and
            # the decoder is going to process more of the file than
            # the part we actually use.
            lines = []
            start = table_props["start"]
            end = start + table_props["length"]
            with decompress(check_cases(fn)) as f:
                decoded_f = TextIOWrapper(f, encoding="UTF-8", errors="replace")
                for i, line in enumerate(decoded_f):
                    # keep lines with index in [start, end); stop reading
                    # once past the requested range
                    if i >= end:
                        break
                    if i >= start:
                        # re-emit each kept line with a CRLF terminator
                        lines.append(line.replace("\n", "\r\n"))
            text = "".join(lines)
        else:
            with decompress(check_cases(fn)) as file:
                file.seek(table_props["start"])
                # never read more than 80000 bytes, whatever the stated
                # length
                text = file.read(min(table_props["length"], 80000)).decode(
                    "ISO-8859-1"
                )
        return text
    except (ValueError, OSError) as ex:
        # failure is non-fatal: warn and fall off the end (returning None)
        warnings.warn(f"unable to parse {name}: {ex}")

loaders.utility

Support objects for 'utility' Loader subclasses.

is_trivial(pointer: str) -> bool

Returns True if this is the name of a data object we want to handle trivially, in the sense that we never ever want to load it directly.

Source code in pdr/loaders/utility.py
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def is_trivial(pointer: str) -> bool:
    """
    Return True if `pointer` names a data object that should be handled
    trivially, i.e. one we never want to load directly.
    """
    # TIFF tags / headers should always be parsed by the TIFF parser itself;
    # TIFF images and documents are still loaded normally
    is_tiff_metadata = "TIFF" in pointer and not (
        "IMAGE" in pointer or "DOCUMENT" in pointer
    )
    # we don't present STRUCTURES separately from their tables
    is_structure = "STRUCTURE" in pointer
    # only in MSL CCAM products; probably for internal processing pipelines
    is_pds_object = "PDS_OBJECT" in pointer
    return is_tiff_metadata or is_structure or is_pds_object

looks_like_this_kind_of_file(filename: str, kind_extensions: Collection[str]) -> bool

Does this file have any of these extensions?

Source code in pdr/loaders/utility.py
71
72
73
74
75
76
def looks_like_this_kind_of_file(
    filename: str, kind_extensions: Collection[str]
) -> bool:
    """
    Does this file have any of these extensions?

    The comparison is made against every suffix of the lowercased filename
    (so "a.tar.gz" matches both ".tar" and ".gz"); `kind_extensions` should
    therefore be given in lowercase with a leading dot. `filename` may be a
    string or a path-like object such as `pathlib.Path`.
    """
    # str() lets Path objects through: they have no .lower() method, so
    # lowercasing them directly would raise AttributeError.
    suffixes = Path(str(filename).lower()).suffixes
    return any(suffix in kind_extensions for suffix in suffixes)

tbd(name: str, block: MultiDict, *_, **__)

This is a placeholder function for objects that are not explicitly supported elsewhere. It throws a warning and passes just the value of the pointer.

Source code in pdr/loaders/utility.py
61
62
63
64
65
66
67
68
def tbd(name: str, block: MultiDict, *_, **__):
    """
    Placeholder loader for objects that are not explicitly supported
    elsewhere. Warns that the `name` pointer is not fully supported and
    returns `block` unchanged; all other arguments are ignored.
    """
    message = f"The {name} pointer is not yet fully supported."
    warnings.warn(message)
    return block

trivial(*_, **__)

This is a trivial loader. It does not load anything; it exists for pointers we don't want to load and simply want ignored.

Source code in pdr/loaders/utility.py
53
54
55
56
57
58
def trivial(*_, **__):
    """
    A loader that loads nothing. It accepts and ignores any arguments and
    returns None; use it for pointers that should simply be skipped.
    """
    return None

np_utils

Methods for working with numpy objects, primarily intended as components of pdr's image- and table-loading routines.

casting_to_float(array: np.ndarray, *operands: Number) -> bool

check: will this operation cast the array to float? return True if array is integer-valued and any operands are not integers.

Source code in pdr/np_utils.py
56
57
58
59
60
61
62
63
def casting_to_float(array: np.ndarray, *operands: Number) -> bool:
    """
    Check whether combining `array` with `operands` would cast it to float.

    Returns True only if `array` has an integer dtype and at least one
    operand is not an integer. Both Python ints and NumPy integer scalars
    count as integers. With no operands the result is False.
    """
    if array.dtype.char not in np.typecodes["AllInteger"]:
        return False
    # NumPy integer scalars (e.g. np.int64) are not instances of `int`, so
    # they must be accepted explicitly or they would be misreported as
    # forcing a float cast.
    return not all(
        isinstance(operand, (int, np.integer)) for operand in operands
    )

enforce_order_and_object(array: np.ndarray, inplace=True) -> np.ndarray

Make an ndarray compatible for use with pandas or other similarly-strict interfaces. Determine which, if any, of the array's fields are in nonnative byteorder and swap them; also convert any void dtypes to object.

Source code in pdr/np_utils.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def enforce_order_and_object(array: np.ndarray, inplace=True) -> np.ndarray:
    """
    Make an ndarray suitable for pandas and other similarly strict
    interfaces: fields in nonnative byte order are converted to native
    order, and void dtypes are converted to object.

    If `inplace` is False, the input is copied before any processing.
    """
    # NOTE: doing the void conversion in this function is inelegant but
    # somewhat efficient.
    # TODO: or is it? benchmark.
    if inplace is False:
        array = array.copy()
    n_fields = len(array.dtype)
    if n_fields >= 2:
        # several fields: build a per-field target dtype and let numpy cast
        target_dtype = []
        for name, field in array.dtype.fields.items():
            field_dtype = field[0]
            if field_dtype.isnative is False:
                target_dtype.append((name, field_dtype.newbyteorder("=")))
            elif "V" in str(field_dtype):
                target_dtype.append((name, "O"))
            else:
                target_dtype.append((name, field_dtype))
        # TODO: this may work unreliably for small integer types
        return np.array(array, dtype=target_dtype)
    if n_fields == 1:
        dtype = array.dtype[0]
        # if we don't slice the field out explicitly, numpy will transform
        # it into an array of tuples
        only_field = tuple(array.dtype.fields.keys())[0]
        void_return = array[only_field]
    else:
        dtype = array.dtype
        void_return = array
    if "V" in str(dtype):
        return void_return.astype("O")
    if dtype.isnative:
        return array
    native_dtype = array.dtype.newbyteorder("=")
    return array.byteswap().view(native_dtype)

ibm32_to_np_f32(ibm)

Convert an array of IBM System 360-style 32-bit floats (expressed as 32-bit unsigned integers) to numpy float64.

Source code in pdr/np_utils.py
121
122
123
124
125
126
def ibm32_to_np_f32(ibm):
    """
    Convert an array of IBM System 360-style 32-bit floats (expressed as 32-bit
    unsigned integers) to numpy float64.
    """
    sign_shift = 31
    exponent_shift = 24
    mantissa_mask = 0x00ffffff
    return ibm_to_np(ibm, sign_shift, exponent_shift, mantissa_mask)

ibm64_to_np_f64(ibm)

Convert an array of IBM System 360-style 64-bit floats (expressed as 64-bit unsigned integers) to numpy float64.

Source code in pdr/np_utils.py
129
130
131
132
133
134
def ibm64_to_np_f64(ibm):
    """
    Convert an array of IBM System 360-style 64-bit floats (expressed as 64-bit
    unsigned integers) to numpy float64.
    """
    sign_shift = 63
    exponent_shift = 56
    mantissa_mask = 0x00ffffffffffffff
    return ibm_to_np(ibm, sign_shift, exponent_shift, mantissa_mask)

ibm_to_np(ibm: np.ndarray, sreg: int, ereg: int, mmask: int) -> np.ndarray

Convert an array composed of IBM System 360-style floats (expressed as 4- or 8-byte unsigned integers, as appropriate for byte width) to numpy float64.

Source code in pdr/np_utils.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def ibm_to_np(ibm: np.ndarray, sreg: int, ereg: int, mmask: int) -> np.ndarray:
    """
    Convert an array of IBM System 360-style floats (expressed as 4- or
    8-byte unsigned integers, as appropriate for byte width) to numpy
    float64.

    `sreg` is the right-shift that isolates the sign bit, `ereg` the
    right-shift that isolates the 7-bit exponent, and `mmask` the bitmask
    that selects the mantissa. The value is computed as
    sign * (mantissa / 2**ereg) * 16**(exponent - 64).
    """
    # the sign bit has to live in a signed dtype so that 1 - 2*bit can
    # become -1
    sign_bit = ((ibm >> sreg) & 0x01).astype('int8')
    sign = 1 - (2 * sign_bit).astype('int8')
    # the exponent is held as float64 because the largest possible values
    # would overflow int64 or float32
    biased_exponent = ((ibm >> ereg) & 0x7f).astype('float64')
    scale = 16 ** (biased_exponent - 64)
    fraction = (ibm & mmask) / (2 ** ereg)
    return sign * fraction * scale

make_c_contiguous(arr: np.ndarray) -> np.ndarray

If an ndarray isn't C-contiguous, reorder it as C-contiguous. If it is, don't mess with it.

Source code in pdr/np_utils.py
 93
 94
 95
 96
 97
 98
 99
100
def make_c_contiguous(arr: np.ndarray) -> np.ndarray:
    """
    Return `arr` itself if it is already C-contiguous; otherwise return a
    C-contiguous copy of it.
    """
    c_contiguous_flag = arr.flags["C_CONTIGUOUS"]
    if c_contiguous_flag is not False:
        return arr
    return np.ascontiguousarray(arr)

np_from_buffered_io(buffered_io: BufferedIOBase, dtype: Union[np.dtype, str], offset: Optional[int] = None, count: Optional[int] = None) -> np.ndarray

Read a 1D numpy array of the specified dtype, size, and offset from a buffered IO object.

Source code in pdr/np_utils.py
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def np_from_buffered_io(
    buffered_io: BufferedIOBase,
    dtype: Union[np.dtype, str],
    offset: Optional[int] = None,
    count: Optional[int] = None,
) -> np.ndarray:
    """
    Read a 1D numpy array of the specified dtype, size, and offset from a
    buffered IO object.

    Parameters:
        buffered_io: open binary stream to read from.
        dtype: element type, as a numpy dtype or anything `np.dtype()`
            accepts (such as a dtype string).
        offset: if given, the stream is first seeked to this position.
        count: number of elements to read; None reads to the end of the
            stream.
    """
    if offset is not None:
        buffered_io.seek(offset)
    if isinstance(buffered_io, (BZ2File, ZipFile, GzipFile, BytesIO)):
        # we need to read the appropriate amount into a new buffer, especially
        # if it's monolithically compressed
        # np.dtype() normalizes string dtypes, which have no .itemsize
        # attribute of their own
        n_bytes = None if count is None else count * np.dtype(dtype).itemsize
        stream = BytesIO(buffered_io.read(n_bytes))
        return np.frombuffer(stream.getbuffer(), dtype=dtype)
    count = -1 if count is None else count
    # In this case, buffered_io is just an open file stream
    return np.fromfile(buffered_io, dtype=dtype, count=count)

parselabel

parselabel.pds3

Parsing utilities for PDS3 labels.

STRUCTUREPAT = re.compile('\\^(?:(?:\\w|_)+_)?STRUCTURE$') module-attribute

regex pattern for format file pointers

BlockParser

Utility class for stateful recursive parsing and aggregation of a series of PVL statements.

Source code in pdr/parselabel/pds3.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
class BlockParser:
    """
    Stateful helper that folds a flat series of PVL statements into a
    (possibly nested) MultiDict, tracking which blocks are currently open.
    """
    def __init__(self):
        """Start with no open blocks, one root aggregation, no parameters."""
        # names of the blocks currently open, outermost first
        self.names = []
        # one MultiDict per open block; index 0 is the root
        self.aggregations = [MultiDict()]
        # every parameter name added so far, at any nesting level
        self.parameters = []

    def _step_out(self):
        """Exit a block, storing it in its parent under the block's name."""
        closed_name = self.names.pop()
        closed_block = self.aggregations.pop()
        self.add_statement(closed_name, closed_block)

    def _step_in(self, name):
        """Enter a block."""
        self.names.append(name)
        self.aggregations.append(MultiDict())

    def add_statement(self, parameter, value):
        """Add a statement to the innermost open block and record its key."""
        innermost = self.aggregations[-1]
        innermost.add(parameter, value)
        self.parameters.append(parameter)

    def parse_statements(
        self, statements
    ) -> tuple[MultiDict[str, Any], list[str]]:
        """
        Parse a series of PVL statements into a (possibly nested) MultiDict
        and a flattened list of all keys at all levels of that MultiDict.
        """
        for parameter, value in statements:
            if parameter in PVL_BLOCK_INITIALS:
                self._step_in(value)
                continue
            if not parameter.startswith("END"):
                self.add_statement(parameter, value)
                continue
            # Block terminator. Aggregation names are not verified, and an
            # invalid end-block statement at top level is silently ignored.
            if self.names:
                self._step_out()
        has_unclosed_blocks = len(self.aggregations) > 1
        if has_unclosed_blocks:
            warnings.warn(
                "Leftover aggregations. This may indicate malformatted PVL, "
                "premature label truncation, or the existence of multiple "
                "distinct PVL-texts in the file. If the label is very large, "
                "consider increasing max_size or passing a larger number as "
                "the pvl_limit parameter when initializing the calling "
                "pdr.Data object."
            )
        return self.aggregations[0], self.parameters
__init__()
Source code in pdr/parselabel/pds3.py
87
88
89
def __init__(self):
    """Start with no open blocks, one root aggregation, no parameters."""
    self.names = []
    self.aggregations = [MultiDict()]
    self.parameters = []
_step_in(name)

Enter a block.

Source code in pdr/parselabel/pds3.py
95
96
97
98
def _step_in(self, name):
    """Enter a block: record its name and open a fresh aggregation."""
    opened_block = MultiDict()
    self.names.append(name)
    self.aggregations.append(opened_block)
_step_out()

Exit a block.

Source code in pdr/parselabel/pds3.py
91
92
93
def _step_out(self):
    """Exit a block, storing it in its parent under the block's name."""
    closed_name = self.names.pop()
    closed_block = self.aggregations.pop()
    self.add_statement(closed_name, closed_block)
add_statement(parameter, value)

Add a statement.

Source code in pdr/parselabel/pds3.py
100
101
102
103
def add_statement(self, parameter, value):
    """Add a statement to the innermost open block and record its key."""
    innermost = self.aggregations[-1]
    innermost.add(parameter, value)
    self.parameters.append(parameter)
parse_statements(statements) -> tuple[MultiDict[str, Any], list[str]]

Parse a series of PVL statements into a (possibly nested) MultiDict and a flattened list of all keys at all levels of that MultiDict.

Source code in pdr/parselabel/pds3.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
def parse_statements(
    self, statements
) -> tuple[MultiDict[str, Any], list[str]]:
    """
    Parse a series of PVL statements into a (possibly nested) MultiDict
    and a flattened list of all keys at all levels of that MultiDict.
    """
    for parameter, value in statements:
        if parameter in PVL_BLOCK_INITIALS:
            self._step_in(value)
            continue
        if not parameter.startswith("END"):
            self.add_statement(parameter, value)
            continue
        # Block terminator. Aggregation names are not verified, and an
        # invalid end-block statement at top level is silently ignored.
        if self.names:
            self._step_out()
    has_unclosed_blocks = len(self.aggregations) > 1
    if has_unclosed_blocks:
        warnings.warn(
            "Leftover aggregations. This may indicate malformatted PVL, "
            "premature label truncation, or the existence of multiple "
            "distinct PVL-texts in the file. If the label is very large, "
            "consider increasing max_size or passing a larger number as "
            "the pvl_limit parameter when initializing the calling "
            "pdr.Data object."
        )
    return self.aggregations[0], self.parameters

chunk_statements(trimmed_lines: Iterable[str]) -> list[tuple[str, str]]

chunk trimmed lines from a pvl-text into assignment statements.

Source code in pdr/parselabel/pds3.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def chunk_statements(trimmed_lines: Iterable[str]) -> list[tuple[str, str]]:
    """
    Group trimmed lines from a pvl-text into (parameter, value) assignment
    statements. Block terminators become (terminator, "") pairs; lines after
    an assignment line are joined onto that assignment's value.
    """
    chunks = []
    for statement in split_before(trimmed_lines, is_an_assignment_line):
        assignment, continuation = statement[0], statement[1:]
        terminal = extract_pvl_block_terminal(assignment)
        if terminal is not None:
            chunks.append((terminal, ""))
            continue
        pieces = assignment.split("=")
        if len(pieces) != 2:
            # some people like to put extra '='s on assignment lines, like:
            # MRO:SPECIMEN_DESC      = "MONTMORILLONITE + FEOX, 100 % FECL2 SOL_N, PH=7,
            # i strongly suspect we will never make semantic use of
            # parameters like this and so we will just ignore them for now
            continue
        parameter, value_head = (piece.strip() for piece in pieces)
        value_tail = " ".join(map(str.strip, continuation))
        chunks.append((parameter, value_head + value_tail))
    return chunks

depointerize(string: str) -> str

prevent a string from starting with ^

Source code in pdr/parselabel/pds3.py
370
371
372
def depointerize(string: str) -> str:
    """Strip a single leading "^" from `string`, if it has one."""
    if not string.startswith("^"):
        return string
    return string[1:]

extract_pvl_block_terminal(line: str) -> Optional[str]

get the PVL block terminator, if any, from a string

Source code in pdr/parselabel/pds3.py
33
34
35
36
37
38
def extract_pvl_block_terminal(line: str) -> Optional[str]:
    """
    Return the text matched by PVL_BLOCK_TERMINAL at the start of `line`,
    or None if `line` does not begin with a PVL block terminator.
    """
    found = re.match(PVL_BLOCK_TERMINAL, line)
    if found is None:
        return None
    return found.group()

get_pds3_pointers(label: Optional[MultiDict] = None) -> tuple[str]

attempt to get all PDS3 "pointers" -- PVL parameters starting with "^" -- from a MultiDict generated from a PDS3 label. These typically specify physical data locations, and in most cases correspond to data object definitions later in the label (common exceptions include "^STRUCTURE"-type pointers and "^DATA_SET_MAP_PROJECTION").

Source code in pdr/parselabel/pds3.py
350
351
352
353
354
355
356
357
358
359
360
361
362
def get_pds3_pointers(
    label: Optional[MultiDict] = None,
) -> tuple[str]:
    """
    Collect every PDS3 "pointer" key -- a PVL parameter whose name starts
    with "^" -- from a MultiDict built from a PDS3 label, descending into
    nested dicts and MultiDicts.

    Pointers usually give physical data locations and usually have a matching
    data object definition later in the label; "^STRUCTURE"-type pointers and
    "^DATA_SET_MAP_PROJECTION" are common exceptions.
    """

    def is_pointer(key, _value):
        return key.startswith("^")

    return dig_for_keys(label, is_pointer, mtypes=(dict, MultiDict))

index_duplicate_pointers(pointers: Collection[str], mapping: MultiDict[str, Any], params: list[str]) -> tuple[MultiDict[str, Any], list[str]]

Although technically illegal, some PDS3 objects have multiple data objects with the same name. This produces counterintuitive results. This function appends ascending integers to any duplicate members of a specified set of "pointer" keys of a MultiDict, and also their "depointerized" versions, in order to distinguish data objects. This can potentially fail if duplicate-named object pointers and their corresponding object definitions are not given in the same order in a label, but we have not yet encountered that case.

Source code in pdr/parselabel/pds3.py
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
def index_duplicate_pointers(
    pointers: Collection[str], mapping: MultiDict[str, Any], params: list[str]
) -> tuple[MultiDict[str, Any], list[str]]:
    """
    Although technically illegal, some PDS3 objects have multiple data objects
    with the same name. This produces counterintuitive results. This function
    appends ascending integers to any duplicate members of a specified set of
    "pointer" keys of a MultiDict, and also their "depointerized" versions,
    in order to distinguish data objects. This _can_ potentially fail if
    duplicate-named object pointers and their corresponding object definitions
    are not given in the same order in a label, but we have not yet
    encountered that case.

    `pointers` is the collection of pointer names found in the label, with
    repeats; if it is None, the inputs are returned untouched. `mapping` is
    the parsed label; it is replaced by edited copies rather than modified.
    `params` is the flat list of label keys; it is MODIFIED IN PLACE (indexed
    names appended, original names removed) and the same list is returned.

    Pointers whose name contains "STRUCTURE" or "PDS_OBJECT" are never
    indexed. Pointers whose name contains "DESCRIPTION" are indexed silently,
    and their depointerized counterparts are left alone.
    """
    if pointers is None:
        return mapping, params
    # group identical pointer names together; each group's length is the
    # number of times that pointer occurs
    # (presumably a toolz-style groupby returning a dict -- confirm at import)
    # noinspection PyTypeChecker
    pt_groups = groupby(identity, pointers)
    for pointer, group in pt_groups.items():
        if (len(group) > 1) and \
                not any(sub in pointer for sub in
                        ["STRUCTURE", "PDS_OBJECT"]):
            # don't waste anyone's time mentioning that the label
            # references both ODL.TXT and VICAR2.TXT, etc.
            if "DESCRIPTION" not in pointer:
                depoint = True
                warnings.warn(
                    f"Duplicated {pointer}, indexing with integers after each "
                    f"entry (e.g.: {pointer}_0)"
                )
            else:
                depoint = False
            for ix in range(len(group)):
                indexed_pointer = f"{pointer}_{ix}"
                # set_key_index() pops one integer from input_object for each
                # matching key it renames, so a fresh list is passed per call.
                # After the first pass no key named `pointer` should remain,
                # so later passes only copy the mapping.
                mapping = multidict_dig_and_edit(
                    input_multidict=mapping,
                    target=pointer,
                    input_object=list(range(len(group))),
                    setter_function=set_key_index,
                    key_editor=True,
                )
                # keep the flat key list in step with the renamed keys; one
                # occurrence of the original name is removed per iteration
                # (list.remove raises ValueError if none is left)
                params.append(indexed_pointer)
                params.remove(pointer)
                if depoint:
                    # apply the same indexing to the matching object
                    # definition key (the pointer name without its "^")
                    depointer = depointerize(pointer)
                    indexed_depointer = f"{depointer}_{ix}"
                    mapping = multidict_dig_and_edit(
                        input_multidict=mapping,
                        target=depointer,
                        input_object=list(range(len(group))),
                        setter_function=set_key_index,
                        key_editor=True,
                    )
                    params.append(indexed_depointer)
                    # the object definition may be missing from params, so
                    # only remove it when it is actually there
                    if depointer in params:
                        params.remove(depointer)

    return mapping, params

is_an_assignment_line(line: str) -> bool

pick lines that begin assignment statements.

in PDS labels, it never (?) seems to be the case that people use delimiters to put multiple assignment statements on a line

there is an issue with people who put '=' in text blocks -- looking for a block of capital letters is usually good enough

Source code in pdr/parselabel/pds3.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def is_an_assignment_line(line: str) -> bool:
    """
    Decide whether `line` begins a PVL statement.

    A line without "=" counts only if it starts with a PVL block terminator.
    A line with "=" counts only if its first eight characters contain no
    lowercase letters: people sometimes put "=" inside free-text blocks, and
    checking for a leading run of capitals is usually enough to tell those
    apart from real assignments.

    In PDS labels it never (?) seems to be the case that delimiters are used
    to put several assignment statements on one line, so that is not handled.
    """
    if "=" not in line:
        return extract_pvl_block_terminal(line) is not None
    head = line[:8]
    return head == head.upper()

literalize_pvl(obj: Union[str, MultiDict[str, Any]]) -> Union[MultiDict[str, Any], str, int, float, set, tuple]

attempt to interpret string representations of PVL values or aggregations as Python objects. if obj is a MultiDict, attempt to interpret all its values, diving recursively into any contained MultiDicts. permissive; if parsing fails, simply return the string.

Source code in pdr/parselabel/pds3.py
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
def literalize_pvl(
    obj: Union[str, MultiDict[str, Any]]
) -> Union[MultiDict[str, Any], str, int, float, set, tuple]:
    """
    Attempt to interpret string representations of PVL values or aggregations
    as Python objects.

    If `obj` is a MultiDict, all of its values are interpreted via
    `literalize_pvl_block()`. Otherwise `obj` is treated as a string and
    parsed in this order:

    1. as a non-base-10 integer, if it is unquoted and has a "#" at index 1
       or 2;
    2. as a Python literal, via `ast.literal_eval`;
    3. if both of those fail: as a PVL quantity statement when it contains
       both "<" and ">", or else as an "unusual" collection when it starts
       with "(" or "{".

    Permissive: if every applicable parser fails with SyntaxError, ValueError
    or IndexError, the original string is returned unchanged.
    """
    if isinstance(obj, MultiDict):
        return literalize_pvl_block(obj)
    try:
        with warnings.catch_warnings():
            # silence SyntaxWarnings emitted while evaluating label text
            warnings.simplefilter("ignore", SyntaxWarning)
            if (not obj.startswith('"')) and ("#" in obj[1:3]):
                return parse_non_base_10(obj)
            return literal_eval(obj)
    except (SyntaxError, ValueError):
        # not a plain literal; fall through to the PVL-specific parsers
        pass
    try:
        if ("<" in obj) and (">" in obj):
            return parse_pvl_quantity_statement(obj)
        # startswith() is safe on an empty string, unlike obj[0]
        if obj.startswith(("(", "{")):
            return parse_unusual_collection(obj)
    except (SyntaxError, ValueError, IndexError):
        pass
    return obj

literalize_pvl_block(block: MultiDict[str, Any]) -> MultiDict[str, Any]

Parse the values of an entire (possibly-nested) MultiDict whose values are PVL strings into Python objects.

Source code in pdr/parselabel/pds3.py
335
336
337
338
339
340
341
342
343
344
345
346
347
def literalize_pvl_block(block: MultiDict[str, Any]) -> MultiDict[str, Any]:
    """
    Return a copy of `block` in which every value that is not itself a
    MultiDict has been passed through `literalize_pvl()`. Nested MultiDicts
    are traversed by `multidict_dig_and_edit()`.
    """

    def is_leaf(_key, value, _target):
        return not isinstance(value, MultiDict)

    def literalize(_input_object, value):
        return literalize_pvl(value)

    # noinspection PyTypeChecker
    return multidict_dig_and_edit(
        block, None, predicate=is_leaf, setter_function=literalize
    )

looks_pvl(filename) -> bool

Is this probably a PVL file?

Source code in pdr/parselabel/pds3.py
134
135
136
def looks_pvl(filename) -> bool:
    """Guess from the extension (.lbl or .fmt, any case) if this is PVL."""
    extension = Path(filename).suffix.lower()
    return extension == ".lbl" or extension == ".fmt"

multidict_dig_and_edit(input_multidict: MultiDict, target: Any = None, input_object: Any = None, predicate: Callable[[Any, Any, Any], bool] = None, setter_function: Callable = None, key_editor: bool = False, keep_values: bool = True, mtypes: tuple[type, ...] = (MultiDict,)) -> MultiDict

This function produces a modified copy of a MultiDict (or other mapping, but may produce unintended results). It searches through a MultiDict's items, recursively continuing into any children that are an instance of mtypes, and checking for keys for which predicate(key, value, target) is True. If predicate is None, a key matches when key == target.

If "key_editor" is False, the function changes the values associated with those keys. if it is True, the function changes the key names themselves.

If "setter_function" is not None, it replaces those keys/values with the output of "setter function", executed with "input_object" and the original key/value as arguments. If it is None, it will simply replace them with "input_object".

If "keep_values" is not True, the returned MultiDict will contain only edited values, causing this to also act as a filtering function.

Source code in pdr/parselabel/pds3.py
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
def multidict_dig_and_edit(
    input_multidict: MultiDict,
    target: Any = None,
    input_object: Any = None,
    predicate: Callable[[Any, Any, Any], bool] = None,
    setter_function: Callable = None,
    key_editor: bool = False,
    keep_values: bool = True,
    mtypes: tuple[type, ...] = (MultiDict,)
) -> MultiDict:
    """
    Build an edited copy of a MultiDict (other mappings are accepted but may
    give unintended results).

    Every item is visited in order. A value that is an instance of `mtypes`
    is first replaced by the result of recursing into it with the same
    arguments. The item then "matches" if `predicate(key, value, target)`
    does not return `False`; when `predicate` is None, it matches if
    `key == target`.

    For a matching item, `setter_function(input_object, original)` is called,
    where `original` is the value (`key_editor` False: the value is replaced)
    or the key (`key_editor` True: the key is renamed). If `setter_function`
    is None, the replacement is simply `input_object`.

    Non-matching items are copied through when `keep_values` is True. When it
    is not, only nested mappings survive among the non-matching items, so the
    function also acts as a filter.
    """
    if setter_function is None:
        setter_function = constant(input_object)
    edited = MultiDict()
    for key, value in input_multidict.items():
        is_map = isinstance(value, mtypes)
        if is_map:
            # edit children first so the predicate sees the edited child
            value = multidict_dig_and_edit(
                input_multidict=value,
                target=target,
                input_object=input_object,
                predicate=predicate,
                setter_function=setter_function,
                key_editor=key_editor,
                keep_values=keep_values,
                mtypes=mtypes,
            )
        if predicate is not None:
            match = predicate(key, value, target)
        else:
            match = key == target
        if match is False:
            if (keep_values is True) or is_map:
                edited.add(key, value)
        elif key_editor is False:
            edited.add(key, setter_function(input_object, value))
        else:
            edited.add(setter_function(input_object, key), value)
    return edited

parse_non_base_10(text: str) -> int

Convert a PVL representation of a non-base-10 integer to a base-10 Python integer.

Source code in pdr/parselabel/pds3.py
266
267
268
269
270
271
272
273
274
275
def parse_non_base_10(text: str) -> int:
    """
    Convert a PVL representation of a non-base-10 integer, written as
    `base#digits#` (for example "2#1010#" or "16#-4B#"), to a Python integer.

    Raises SyntaxError if `text` does not end with the closing "#", does not
    contain exactly one other "#", or if the base or digits are not valid.
    """
    malformed = "possible malformatted non-base-10 number"
    # Without this check the slice below would silently eat the last digit
    # of an unterminated value such as "16#FF" and return a wrong number.
    if not text.endswith("#"):
        raise SyntaxError(malformed)
    try:
        base, number = text[:-1].split("#")
        return int(number, int(base))
    except ValueError as error:
        raise SyntaxError(malformed) from error

parse_non_base_10_collection(class_: Union[Type[set], Type[tuple]], obj: str) -> Union[tuple[int], set[int]]

Convert a collection of PVL representations of non-base-10 integers to a collection (of the same class) of base-10 Python integers.

Source code in pdr/parselabel/pds3.py
278
279
280
281
282
283
284
285
286
287
def parse_non_base_10_collection(
    class_: Union[Type[set], Type[tuple]], obj: str
) -> Union[tuple[int], set[int]]:
    """
    Parse a PVL collection string whose members are all non-base-10 integers
    and return them, converted with `parse_non_base_10()`, as an instance of
    `class_`. Surrounding braces/parentheses and all spaces are discarded
    before the string is split on commas.
    """
    members = obj.strip('{}()').replace(" ", '').split(',')
    return class_(parse_non_base_10(member) for member in members)

parse_pvl(label: str, deduplicate_pointers: bool = True) -> tuple[MultiDict[str, Any], list[str]]

Parse a PVL-text into a MultiDict and a flattened list of keys.

Source code in pdr/parselabel/pds3.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
def parse_pvl(
    label: str, deduplicate_pointers: bool = True
) -> tuple[MultiDict[str, Any], list[str]]:
    """
    Parse a PVL-text into a MultiDict and a flattened list of keys.

    Comments are blanked out, lines are stripped and empty ones dropped, the
    remainder is chunked into statements and handed to BlockParser. When
    `deduplicate_pointers` is true, repeated pointers are then indexed with
    `index_duplicate_pointers()`. The values of the returned mapping have
    been passed through `literalize_pvl()`.
    """
    uncommented_label = re.sub(r"/\*.*?(\r|\n|/\*)", "\n", label)
    stripped = (line.strip() for line in uncommented_label.split("\n"))
    nonblank = (line for line in stripped if line)
    mapping, params = BlockParser().parse_statements(
        chunk_statements(nonblank)
    )
    if deduplicate_pointers:
        mapping, params = index_duplicate_pointers(
            get_pds3_pointers(mapping), mapping, params
        )
    return literalize_pvl(mapping), params

parse_pvl_quantity_object(obj: str) -> dict[str, Union[str, Number]]

Parse a PVL quantity string into a dict like {'value': 2, 'units': 'km'}.

Source code in pdr/parselabel/pds3.py
173
174
175
176
177
178
179
180
def parse_pvl_quantity_object(obj: str) -> dict[str, Union[str, Number]]:
    """
    Split a PVL quantity string into its value and units and return them as
    a dict like {'value': 2, 'units': 'km'}. Both parts are passed through
    `literalize_pvl()`. An AttributeError propagates if either the
    PVL_QUANTITY_VALUE or the PVL_QUANTITY_UNITS pattern is not found.
    """
    value = literalize_pvl(re.search(PVL_QUANTITY_VALUE, obj).group())
    units = literalize_pvl(re.search(PVL_QUANTITY_UNITS, obj).group(1))
    return {"value": value, "units": units}

parse_pvl_quantity_statement(statement: str) -> Any

parse pvl statements including quantities. returns quantities as mappings. this will also handle statements that do not consist entirely of quantities, notably including tuples of the form '("A5.DAT", 1000 <BYTES>)' that are commonly used to specify start byte offsets for data objects.

Source code in pdr/parselabel/pds3.py
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
def parse_pvl_quantity_statement(statement: str) -> Any:
    """
    Parse a PVL statement that includes one or more quantities.

    The statement is stripped of surrounding parentheses and split on commas.
    Members containing both "<" and ">" become quantity mappings (or are
    kept as raw text if the brackets turn out not to form a quantity); all
    other members go through `literalize_pvl()`. This covers mixed tuples
    such as '("A5.DAT", 1000 <BYTES>)', which are commonly used to give start
    byte offsets for data objects.

    Returns the single parsed member if there is only one, otherwise a tuple
    of all parsed members.
    """

    def parse_member(member):
        if ("<" not in member) or (">" not in member):
            return literalize_pvl(member)
        try:
            return parse_pvl_quantity_object(member)
        except AttributeError:
            # not actually-matched brackets
            return member

    parsed = [
        parse_member(member) for member in statement.strip("()").split(",")
    ]
    return parsed[0] if len(parsed) == 1 else tuple(parsed)

parse_unusual_collection(obj: str) -> Union[tuple[Union[int, str]], set[Union[int, str]]]

Parse a PVL collection of non-base-10 numbers or unquoted strings.

Source code in pdr/parselabel/pds3.py
290
291
292
293
294
295
296
297
298
299
300
def parse_unusual_collection(
    obj: str
) -> Union[tuple[Union[int, str]], set[Union[int, str]]]:
    """
    Parse a PVL collection of non-base-10 numbers or unquoted strings.

    A string starting with "{" yields a set; anything else yields a tuple.
    If the text contains one or two digits followed by "#", it is first tried
    as a collection of non-base-10 integers; if that fails, or does not
    apply, the members are returned as space-stripped strings.
    """
    class_ = tuple
    if obj.startswith('{'):
        class_ = set
    if re.match(r'.*\d{1,2}#', obj) is not None:
        try:
            return parse_non_base_10_collection(class_, obj)
        except (SyntaxError, ValueError):
            pass
    members = obj.strip('{}()').split(',')
    return class_(member.strip(' ') for member in members)

pointerize(string: str) -> str

make a string start with ^ if it didn't already

Source code in pdr/parselabel/pds3.py
365
366
367
def pointerize(string: str) -> str:
    """Return `string` with a "^" prepended unless it already starts with one."""
    if string.startswith("^"):
        return string
    return "^" + string

read_pvl(filename: Union[str, Path], deduplicate_pointers: bool = True, max_size: int = DEFAULT_PVL_LIMIT, default_strict_decode: bool = True) -> tuple[MultiDict, list[str]]

Read and parse a file containing a PVL-text.

Source code in pdr/parselabel/pds3.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
def read_pvl(
    filename: Union[str, Path],
    deduplicate_pointers: bool = True,
    max_size: int = DEFAULT_PVL_LIMIT,
    default_strict_decode: bool = True
) -> tuple[MultiDict, list[str]]:
    """
    Read and parse a file containing a PVL-text.

    If `check_special_label()` recognizes the file, the label text it returns
    is parsed directly. Otherwise the file is opened through `decompress()`
    and the label is cut from its head with `trim_label()`, reading at most
    about `max_size` bytes. Strict decoding is used only when
    `default_strict_decode` is set and the filename does not look like a PVL
    file. `deduplicate_pointers` is forwarded to `parse_pvl()`.
    """
    is_special, label = check_special_label(filename)
    if is_special is not False:
        return parse_pvl(label, deduplicate_pointers)

    strict = default_strict_decode and not looks_pvl(filename)
    with decompress(filename) as stream:
        label = trim_label(stream, max_size, strict_decode=strict)
    return parse_pvl(label, deduplicate_pointers)

set_key_index(pointer_range: list[int], key: str) -> str

utility setter function for multidict_dig_and_edit() as called by index_duplicate_pointers(); appends a number from a list to a string

Source code in pdr/parselabel/pds3.py
434
435
436
437
438
439
440
441
def set_key_index(pointer_range: list[int], key: str) -> str:
    """
    Setter used by `index_duplicate_pointers()` through
    `multidict_dig_and_edit()`: removes the first number from `pointer_range`
    (modifying the list in place) and returns `key` with "_<number>" appended.
    """
    return f"{key}_{pointer_range.pop(0)}"

parselabel.pds4

Simple utilities for preprocessing pds4_tools-produced label objects for the pdr.Metadata constructor.

reformat_pds4_tools_label(label: 'Label') -> tuple[MultiDict, list[str]]

Convert a pds4_tools Label object into a MultiDict and a list of parameters suitable for constructing a pdr.Metadata object. This is not just a type conversion; it also rearranges some nested data structures (in particular, repeated child elements become multiple keys of a MultiDict rather than a list of OrderedDicts).

Source code in pdr/parselabel/pds4.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def reformat_pds4_tools_label(label: "Label") -> tuple[MultiDict, list[str]]:
    """
    Turn a pds4_tools Label object into the (MultiDict, parameter list) pair
    used to construct a pdr.Metadata object.

    This is more than a type conversion: the label's dict form is unpacked
    with `unpack_to_multidict()`, so repeated child elements become repeated
    keys of a MultiDict rather than a list of OrderedDicts.
    """
    label_dict = label.to_dict()
    unpacked = unpack_to_multidict(label_dict, (OrderedDict, MultiDict))
    # every key at every level, used to populate pdr.Metadata's fieldcounts
    params = dig_for_keys(
        unpacked, None, base_pred=constant(True), mtypes=(MultiDict,)
    )
    return unpacked, params

unpack_to_multidict(packed: Mapping, mtypes: tuple[type, ...] = (dict,)) -> MultiDict

Recursively unpack any Mapping into a MultiDict. Unpacks all list or tuple values at any level into multiple keys at that level. This is an unusual-sounding behavior but is generally appropriate for PDS4 labels, and specifically for the pds4_tools representation of XML labels. PDS4 types with cardinality > 1 always (?) represent multiple distinct entities / properties rather than an array of properties. The list can also always be retrieved from the resulting multidict with MultiDict.get_all().

Example:

>>> unpack_to_multidict({'a': 1, 'b': [{'c': 2}, 3]})
<MultiDict('a': 1, 'b': <MultiDict('c': 2)>, 'b': 3)>
Source code in pdr/parselabel/pds4.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def unpack_to_multidict(
    packed: Mapping, mtypes: tuple[type, ...] = (dict,)
) -> MultiDict:
    """
    Recursively convert a Mapping into a MultiDict, expanding every list or
    tuple value, at any level, into repeated keys at that level.

    That may sound odd, but it suits PDS4 labels -- and the pds4_tools
    representation of XML labels in particular -- where types with
    cardinality > 1 always (?) stand for several distinct entities or
    properties rather than an array of properties. The original list can
    still be recovered with `MultiDict.get_all()`.

    Values that are instances of `mtypes` are unpacked recursively; all other
    values are stored as they are. Item order is preserved.

    Example:
    ```
    >>> unpack_to_multidict({'a': 1, 'b': [{'c': 2}, 3]})
    <MultiDict('a': 1, 'b': <MultiDict('c': 2)>, 'b': 3)>
    ```
    """
    result = MultiDict()
    # items are pushed in reverse so that pop() yields them in original order
    stack = list(reversed(packed.items()))
    while stack:
        key, value = stack.pop()
        if isinstance(value, (list, tuple)):
            stack.extend((key, element) for element in reversed(value))
            continue
        if isinstance(value, mtypes):
            value = unpack_to_multidict(value, mtypes)
        result.add(key, value)
    return result

parselabel.utils

DEFAULT_PVL_LIMIT = 1000 * 1024 module-attribute

heuristic for max label size. we know it's not a real rule.

KNOWN_LABEL_ENDINGS = (re.compile(b'\nEND {0,8}(\r| {8})'), re.compile(b'\x00{3}'), b'\nEND\n') module-attribute

Fast regex patterns for generic PVL label endings. They work for almost all PVL labels in the PDS.

_scan_to_end_of_label(buf: IO, max_size: int, text: bytes, raise_no_ending: bool)

Subroutine of trim_label()

Source code in pdr/parselabel/utils.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def _scan_to_end_of_label(
    buf: IO, max_size: int, text: bytes, raise_no_ending: bool
):
    """
    Subroutine of trim_label(). Read `buf` in 50 KiB chunks, appending to
    `text`, until one of KNOWN_LABEL_ENDINGS is found, the stream is
    exhausted, or at least `max_size` bytes' worth of chunks have been read.

    Returns everything read up to and including the ending. If no ending is
    found, raises InvalidAttachedLabel when `raise_no_ending` is True and
    otherwise returns everything read so far.

    The patterns are tried in the order given by KNOWN_LABEL_ENDINGS, and the
    first pattern that matches wins, even if a later pattern would match
    earlier in the data.
    """
    chunk_size = 50 * 1024
    # An ending can be split across two reads (or across the caller-supplied
    # `text` and the first read), so the tail of what we already hold is
    # searched again together with each new chunk. The longest text any
    # current KNOWN_LABEL_ENDINGS pattern can match is 20 bytes ("\nEND" plus
    # 8 + 8 spaces), so 19 bytes of overlap suffice; revisit this number if
    # those patterns change.
    overlap = 19
    length = 0
    while length < max_size:
        if (chunk := buf.read(chunk_size)) == b'':
            break
        tail = text[-overlap:]
        window = tail + chunk
        for ending in KNOWN_LABEL_ENDINGS:
            for endmatch in re.finditer(ending, window):
                # ignore matches lying wholly inside bytes we already held;
                # only matches that reach into the new chunk are accepted
                if (end := endmatch.end() - len(tail)) > 0:
                    return text + chunk[:end]
        text, length = text + chunk, length + chunk_size
    if raise_no_ending is True:
        raise InvalidAttachedLabel("Couldn't find a label ending.")
    return text

trim_label(fn: Union[IO, Path, str], max_size: int = DEFAULT_PVL_LIMIT, strict_decode: bool = True, raise_no_ending: bool = False, special_encoding: str = 'utf-8') -> str

Look for a PVL label at the top of a file.

Source code in pdr/parselabel/utils.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def trim_label(
    fn: Union[IO, Path, str],
    max_size: int = DEFAULT_PVL_LIMIT,
    strict_decode: bool = True,
    raise_no_ending: bool = False,
    special_encoding: str = "utf-8"
) -> str:
    """
    Look for a PVL label at the top of a file and return it as text.

    `fn` may be a path (opened here in binary mode and always closed again)
    or an already-open binary stream (read from its current position and
    left open). `max_size` and `raise_no_ending` are passed on to
    `_scan_to_end_of_label()`.

    With `strict_decode` True, the first 20 bytes must be ASCII and the whole
    label must decode cleanly as `special_encoding`; otherwise undecodable
    bytes are replaced.

    Raises InvalidAttachedLabel if a strict check fails (or, via
    `_scan_to_end_of_label()`, if no label ending is found and
    `raise_no_ending` is True). Errors from opening a path, such as
    FileNotFoundError, propagate unchanged.
    """
    target_is_fn = isinstance(fn, (Path, str))
    # Open *before* the try block: if open() itself fails there is nothing to
    # close, and attempting to close the path object in `finally` would
    # replace the real error with an AttributeError.
    stream = open(fn, 'rb') if target_is_fn else fn
    try:
        text = stream.read(20)
        if strict_decode is True:
            try:
                text.decode('ascii')
            except UnicodeDecodeError:
                raise InvalidAttachedLabel("File head appears to be binary.")
        text = _scan_to_end_of_label(stream, max_size, text, raise_no_ending)
    finally:
        # only close what we opened ourselves
        if target_is_fn:
            stream.close()
    policy = "strict" if strict_decode is True else "replace"
    try:
        return text.decode(special_encoding, errors=policy)
    except UnicodeDecodeError:
        raise InvalidAttachedLabel("Invalid characters in label.")

pd_utils

Methods for working with pandas objects, primarily intended for use in TABLE/ARRAY/SPREADSHEET/HISTOGRAM-loading workflows.

_apply_item_offsets(fmtdef: pd.DataFrame) -> pd.Series

Select item offsets (for a column or container with multiple items). If the specification didn't give item offsets, just assume they're equal to the byte width (i.e. there's no variable padding between fields).

Source code in pdr/pd_utils.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def _apply_item_offsets(fmtdef: pd.DataFrame) -> pd.Series:
    """
    Pick the per-item stride for columns/containers with multiple items.

    Starts from a copy of ITEM_BYTES (i.e. assumes items are packed with no
    padding between them) and, wherever the format definition gives a
    non-null ITEM_OFFSET, uses that instead. Raises ValueError if any given
    ITEM_OFFSET is smaller than the corresponding ITEM_BYTES.
    """
    sizes = fmtdef["ITEM_BYTES"].copy()
    if "ITEM_OFFSET" in fmtdef.columns:
        has_offset = fmtdef["ITEM_OFFSET"].notna()
        explicit = fmtdef.loc[has_offset]
        too_narrow = explicit["ITEM_OFFSET"] < explicit["ITEM_BYTES"]
        if too_narrow.any():
            raise ValueError(
                "Don't know how to intepret a field narrower than its value."
            )
        sizes.loc[explicit.index] = explicit["ITEM_OFFSET"]
    return sizes

booleanize_booleans(table: pd.DataFrame, fmtdef: pd.DataFrame) -> pd.DataFrame

We generally load boolean columns from binary tables as uint8 of value 0 or 1. This converts all such columns of a DataFrame to np.bool.

Source code in pdr/pd_utils.py
242
243
244
245
246
247
248
249
250
251
def booleanize_booleans(
    table: pd.DataFrame, fmtdef: pd.DataFrame
) -> pd.DataFrame:
    """
    Cast to bool every column of `table` that `fmtdef` lists with DATA_TYPE
    "BOOLEAN". Such columns are generally loaded from binary tables as uint8
    values of 0 or 1. `table` is modified in place and also returned.
    """
    is_boolean = fmtdef["DATA_TYPE"] == "BOOLEAN"
    names = fmtdef.loc[is_boolean, "NAME"]
    as_bool = table[names].astype(bool)
    table[names] = as_bool
    return table

compute_offsets(fmtdef: pd.DataFrame) -> pd.DataFrame

PDS3 TABLE/SPREADSHEET/ARRAY specifications do not explicitly give the correct byte offsets for CONTAINERs, COLLECTIONs, anything loaded in by reference from a STRUCTURE, or repeated elements of a COLUMN. Byte offsets in these cases always refer to their parent containers, which can repeat, have children with their own repetitions, etc., etc. This function 'unpacks' a format definition as necessary and adds an SB_OFFSET column giving the correct byte offsets (from record start) for each field of the data table/array.

Source code in pdr/pd_utils.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def compute_offsets(fmtdef: pd.DataFrame) -> pd.DataFrame:
    """
    PDS3 TABLE/SPREADSHEET/ARRAY specifications do not explicitly give the
    correct byte offsets for CONTAINERs, COLLECTIONs, anything loaded in by
    reference from a STRUCTURE, or repeated elements of a COLUMN. Byte offsets
    in these cases always refer to their parent containers, which can repeat,
    have children with their own repetitions, etc., etc. This function
    'unpacks' a format definition as necessary and adds an SB_OFFSET column
    giving the correct byte offsets (from record start) for each field of the
    data table/array.

    `fmtdef` is modified in place (SB_OFFSET, and possibly ITEM_SIZE and
    BLOCK_REPETITIONS, are written to it). The returned frame is that same
    object unless a trailing padding row has to be added, in which case it is
    a new, re-indexed DataFrame ending in a "PLACEHOLDER_0" VOID field.

    Raises NotImplementedError if a repeated block has a non-integer
    BLOCK_BYTES.
    """
    # START_BYTE is 1-indexed, but we're preparing these offsets for
    # numpy, which 0-indexes
    fmtdef["SB_OFFSET"] = fmtdef["START_BYTE"].astype(int) - 1
    if "ROW_PREFIX_BYTES" in fmtdef.columns:
        fmtdef["SB_OFFSET"] += fmtdef["ROW_PREFIX_BYTES"]
    # names of all blocks, in order of first appearance, ignoring any
    # padding row already present
    block_names = fmtdef.loc[
        fmtdef['NAME'] != "PLACEHOLDER_0", "BLOCK_NAME"
    ].unique()
    # calculate offsets for formats loaded in by reference
    # (the first block is skipped -- presumably the top-level block, whose
    # START_BYTEs are already relative to the record; TODO confirm)
    for block_name in block_names[1:]:
        if block_name in ("PLACEHOLDER_None", f"PLACEHOLDER_{block_names[0]}"):
            continue
        fmt_block = fmtdef.loc[fmtdef["BLOCK_NAME"] == block_name]
        # `prior` is the row this block's offsets are measured from: the row
        # named after the block for PLACEHOLDER blocks, otherwise the row
        # whose index label is one less than the block's first row
        # (assumes a default integer index -- TODO confirm)
        if "PLACEHOLDER" in block_name:
            prior = fmtdef[fmtdef["NAME"] == block_name].squeeze()
        else:
            prior = fmtdef.loc[fmt_block.index[0] - 1]
        # shift the block so that it starts where `prior` ends
        fmtdef.loc[fmt_block.index, "SB_OFFSET"] += (
            prior["SB_OFFSET"] + prior["BYTES"]
        )
        # both the block's and prior's SB_OFFSET already include the row
        # prefix, so the sum above counted it twice
        if "ROW_PREFIX_BYTES" in fmtdef.columns:
            fmtdef.loc[fmt_block.index, "SB_OFFSET"] -= fmtdef["ROW_PREFIX_BYTES"]
        count = fmt_block["BLOCK_REPETITIONS"].iloc[0]
        # repetitions of the preceding row multiply into this block's
        if (reps := prior["BLOCK_REPETITIONS"]) > 1:
            if "PLACEHOLDER" in block_name:
                fmtdef.loc[fmt_block.index, "BLOCK_REPETITIONS"] *= reps
            else:
                count *= reps
        if count == 1:
            continue
        # split the block's rows into `count` consecutive groups, one per
        # repetition (presumably more_itertools.divide -- confirm at import)
        chunks = tuple(map(list, divide(count, fmt_block.index)))
        block_size = fmt_block['BLOCK_BYTES'].iloc[0]
        if block_size != int(block_size):
            raise NotImplementedError("irregular repeated container size.")
        block_size = int(block_size)
        # repetition number for every row: 0 for each row of the first
        # group, 1 for each row of the second, and so on
        offset_chain = chain(
            *[[i for _ in c] for (i, c) in enumerate(chunks)]
        )
        # each repetition sits one block_size further into the record
        fmtdef.loc[
            fmt_block.index, "SB_OFFSET"
        ] += np.array(list(offset_chain)) * block_size
    # correctly compute offsets within columns w/multiple items
    if "ITEM_BYTES" in fmtdef.columns:
        fmtdef["ITEM_SIZE"] = _apply_item_offsets(fmtdef)
        column_groups = fmtdef.loc[fmtdef["ITEM_SIZE"].notna()]
        # rows sharing an SB_OFFSET are treated as the items of one column;
        # value_counts gives the number of items at each offset
        group_offs = column_groups['SB_OFFSET'].value_counts().sort_index()
        gix_list, position = [], 0
        for off, gl in zip(group_offs.index, group_offs.values):
            # NOTE(review): `position` walks column_groups in row order while
            # the groups are visited in ascending-offset order; this assumes
            # the two orders agree -- confirm
            itemsize = int(column_groups['ITEM_SIZE'].iloc[position])
            # item i of the column starts i * itemsize after the column start
            gix_list += [(i * itemsize) + off for i in range(gl)]
            position += gl
        fmtdef.loc[column_groups.index, 'SB_OFFSET'] = gix_list
    # bytes left between the end of the last field and the end of the row
    pad_length = 0
    end_byte = fmtdef["SB_OFFSET"].iloc[-1] + fmtdef["BYTES"].iloc[-1]
    if "ROW_BYTES" in fmtdef.columns:
        pad_length += fmtdef["ROW_BYTES"].iloc[0] - end_byte
    if "ROW_SUFFIX_BYTES" in fmtdef.columns:
        pad_length += fmtdef["ROW_SUFFIX_BYTES"].iloc[0]
    if pad_length > 0:
        # cover the remainder with a VOID field
        # NOTE(review): START_BYTE here is set to the 0-indexed end_byte,
        # unlike the 1-indexed START_BYTE of ordinary rows -- confirm intended
        placeholder_rec = {
            "NAME": "PLACEHOLDER_0",
            "DATA_TYPE": "VOID",
            "BYTES": pad_length,
            "START_BYTE": end_byte,
            "SB_OFFSET": end_byte,
        }
        fmtdef = pd.concat(
            [fmtdef, pd.DataFrame([placeholder_rec])]
        ).reset_index(drop=True)
    return fmtdef

construct_nested_array_format(fmtdef: pd.DataFrame) -> pd.DataFrame

ARRAY objects can be deeply nested. This function computes the correct byte offsets and dtypes (including array shape) for any nested subelements.

Source code in pdr/pd_utils.py
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
def construct_nested_array_format(fmtdef: pd.DataFrame) -> pd.DataFrame:
    """
    ARRAY objects can be deeply nested. This function computes the correct
    byte offsets and dtypes (including array shape) for any nested subelements.

    For every block after the first, the row immediately preceding the block
    is treated as its parent. If that parent has a non-null AXIS_ITEMS, the
    block is converted to a dtype with offsets relative to the parent, the
    pair `(dtype, AXIS_ITEMS)` is stored in the parent's "dt" cell, and all
    rows whose NAME appears in the block are dropped from the result.

    Returns the reduced format definition. The "dt" assignment is made on
    whatever object `fmtdef` currently refers to, which on the first
    processed block is the caller's DataFrame.
    """
    # all block names in order of first appearance, minus the first one
    # and ignoring any trailing padding row
    block_names = (
        fmtdef.loc[fmtdef["NAME"] != "PLACEHOLDER_0", "BLOCK_NAME"]
        .unique()[1:]
    )
    for block_name in block_names:
        if block_name == "":
            continue
        fmt_block = fmtdef.loc[fmtdef["BLOCK_NAME"] == block_name]
        # the block's rows may already have been removed on an earlier pass
        # (rows are dropped by NAME at the bottom of this loop)
        if fmt_block.empty:
            continue
        # parent = the row whose index label is one less than the block's
        # first row (assumes integer index labels -- TODO confirm)
        prior_idx = fmt_block.index[0] - 1
        prior = fmtdef.loc[prior_idx]
        axis_items = prior.get("AXIS_ITEMS", np.nan)
        # a parent without AXIS_ITEMS is not a repeated axis; leave as is
        if pd.isna(axis_items):
            continue
        axis_items = int(axis_items)
        # Compute the dtype using a copy of the block with SB_OFFSET adjusted,
        # instead of mutating a view-backed slice (SettingWithCopy).
        fmt_block_rel = fmt_block.copy()
        fmt_block_rel.loc[
            :, "SB_OFFSET"
        ] = fmt_block_rel["SB_OFFSET"] - prior["SB_OFFSET"]

        dt = fmtdef_to_dtype(fmt_block_rel)
        # store (element dtype, element count) on the parent row
        fmtdef.at[prior_idx, "dt"] = (dt, axis_items)
        # the children are now described by the parent's dt, so remove them
        # (matching is by NAME, so same-named rows elsewhere go too)
        fmtdef = fmtdef.loc[~fmtdef["NAME"].isin(fmt_block_rel["NAME"])]

    return fmtdef

convert_ebcdic(table: pd.DataFrame, fmtdef: pd.DataFrame) -> pd.DataFrame

Decode any columns of a DataFrame that contain bytestrings constructed from IBM S/360-style EBCDIC-encoded text to Python strings.

Source code in pdr/pd_utils.py
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
def convert_ebcdic(
    table: pd.DataFrame, fmtdef: pd.DataFrame
) -> pd.DataFrame:
    """
    Decode the EBCDIC text columns of `table` in place and return `table`.

    Every field whose DATA_TYPE in `fmtdef` contains "EBCDIC" is treated as a
    column of bytestrings and decoded to Python strings using the 'cp500'
    codec.
    """
    is_ebcdic = fmtdef["DATA_TYPE"].str.contains("EBCDIC")
    for name in fmtdef.loc[is_ebcdic, "NAME"]:
        # TODO: the pd.Series() wrapper looks redundant; kept for parity
        encoded = pd.Series(table[name])
        table[name] = encoded.str.decode('cp500')
    return table

convert_ibm_reals(df: pd.DataFrame, fmtdef: pd.DataFrame) -> pd.DataFrame

Converts all IBM reals in a dataframe from packed 32- or 64-bit integer form to np.float32 or np.float64.

Source code in pdr/pd_utils.py
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
def convert_ibm_reals(df: pd.DataFrame, fmtdef: pd.DataFrame) -> pd.DataFrame:
    """
    Converts all IBM reals in a dataframe from packed 32- or 64-bit integer
    form to np.float32 or np.float64.

    Args:
        df: table whose columns are named by the "NAME" values of `fmtdef`.
            Modified in place.
        fmtdef: format definition with "NAME", "DATA_TYPE" and "BYTES"
            columns. Fields whose DATA_TYPE matches ``IBM.*REAL`` are
            converted; 4-byte fields are treated as IBM shorts and all others
            as IBM longs.

    Returns:
        `df`, with the IBM real columns replaced by their converted values.
    """
    if not fmtdef['DATA_TYPE'].str.contains('IBM').any():
        return df
    reals = {}
    for _, field in fmtdef.iterrows():
        if not re.match(r'IBM.*REAL', field['DATA_TYPE']):
            continue
        is_short = field['BYTES'] == 4
        func = ibm32_to_np_f32 if is_short else ibm64_to_np_f64
        converted = func(df[field['NAME']].values)
        if is_short:
            # IBM shorts are wider-range than IEEE shorts; check if we can
            # safely cast them back down to float32. The size guards keep
            # max()/min() from raising ValueError on a zero-row column.
            absolute = np.abs(converted)
            nonzero = absolute[absolute > 0]
            big = (
                absolute.size > 0
                and absolute.max() > np.finfo(np.float32).max
            )
            small = nonzero.size > 0 and nonzero.min() < 1e-44
            if not (big or small):
                converted = converted.astype(np.float32)
        # IBM longs just get more precise, not wider-ranged, so we don't need
        # to check for longlong or anything like that
        reals[field['NAME']] = converted
    # assign only after every column has been converted, so each conversion
    # reads the original packed values
    for k, v in reals.items():
        df[k] = v
    return df

convert_vax_reals(data: pd.DataFrame, properties: pd.DataFrame) -> pd.DataFrame

If any columns in a DataFrame are in 32-bit VAX real format, convert them to 32-bit float.

Source code in pdr/pd_utils.py
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
def convert_vax_reals(data: pd.DataFrame, properties: pd.DataFrame) -> pd.DataFrame:
    """
    Convert any VAX-real columns of `data` using `vax.from_vax32`.

    Fields are selected by matching the DATA_TYPE column of `properties`
    against ``VAX.*REAL``. `data` is modified in place and returned.
    """
    has_vax = properties['DATA_TYPE'].str.contains('VAX').any()
    if not has_vax:
        return data
    # TODO: choose vax.from_vax64 when field['BYTES'] != 4
    # Convert everything before assigning anything, so each conversion reads
    # the original column values.
    converted = {
        field['NAME']: vax.from_vax32(data[field['NAME']].values)
        for _, field in properties.iterrows()
        if re.match(r'VAX.*REAL', field['DATA_TYPE'])
    }
    for name, values in converted.items():
        data[name] = values
    return data

fmtdef_to_dtype(fmtdef: pd.DataFrame) -> np.dtype

Construct a structured (but ideally never nested, see construct_nested_array_format() below) dtype from a format definition.

Source code in pdr/pd_utils.py
195
196
197
198
199
200
201
202
203
204
def fmtdef_to_dtype(fmtdef: pd.DataFrame) -> np.dtype:
    """
    Build a structured numpy dtype from a format definition.

    The "NAME", "dt" and "SB_OFFSET" columns (whichever are present, taken
    in that order) supply the dtype's "names", "formats" and "offsets".
    Ideally the result is never nested; see
    `construct_nested_array_format()`.
    """
    present = [
        col for col in ("NAME", "dt", "SB_OFFSET") if col in fmtdef.columns
    ]
    column_values = fmtdef[present].to_dict("list").values()
    # zip() stops at the shorter input, so missing columns simply leave the
    # trailing spec keys unset
    spec = dict(zip(("names", "formats", "offsets"), column_values))
    return np.dtype(spec)

insert_sample_types_into_df(fmtdef: pd.DataFrame, identifiers: DataIdentifiers) -> tuple[pd.DataFrame, np.dtype]

Insert numpy-compatible data type strings into a TABLE/ARRAY format definition DataFrame. Also generate a numpy dtype object from that DataFrame.

Source code in pdr/pd_utils.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
def insert_sample_types_into_df(
    fmtdef: pd.DataFrame, identifiers: DataIdentifiers
) -> tuple[pd.DataFrame, np.dtype]:
    """
    Fill the "dt" column of a TABLE/ARRAY format definition with
    numpy-compatible data type strings, then build a numpy dtype from it.

    Fields are processed in groups sharing the same DATA_TYPE, ITEM_BYTES
    and BYTES. ITEM_BYTES gives the sample size when it is set; otherwise
    BYTES does. Returns the (possibly block-collapsed) format definition
    together with the dtype generated from it. Raises KeyError when a data
    type is not supported.
    """
    fmtdef["dt"] = None
    if "ITEM_BYTES" not in fmtdef.columns:
        fmtdef["ITEM_BYTES"] = np.nan
    grouping_columns = ["DATA_TYPE", "ITEM_BYTES", "BYTES"]
    # materialize the groups up front, since fmtdef is written to below
    groups = tuple(fmtdef.groupby(grouping_columns, dropna=False))
    for type_key, members in groups:
        type_name, item_bytes, total_bytes = type_key
        if np.isnan(item_bytes):
            sample_bytes = total_bytes
        else:
            sample_bytes = item_bytes
        try:
            is_special, special_type = check_special_sample_type(
                identifiers,
                {"SAMPLE_TYPE": type_name, "BYTES_PER_PIXEL": sample_bytes},
            )
            if is_special:
                resolved = special_type
            else:
                resolved = sample_types(
                    type_name, int(sample_bytes), for_numpy=True
                )
            fmtdef.loc[members.index, "dt"] = resolved
        except KeyError:
            raise KeyError(
                f"{type_key} is not a currently-supported data type."
            )
    if "BLOCK_NAME" in fmtdef.columns:
        fmtdef = construct_nested_array_format(fmtdef)
    return fmtdef, fmtdef_to_dtype(fmtdef)

numeric_columns(df: pd.DataFrame) -> list[Hashable]

Return names of all 'numeric' columns in a DataFrame.

Source code in pdr/pd_utils.py
27
28
29
30
31
32
33
def numeric_columns(df: pd.DataFrame) -> list[Hashable]:
    """
    Return names of all 'numeric' columns in a DataFrame.

    A column counts as numeric when `pandas.api.types.is_numeric_dtype`
    accepts its dtype. Names are returned in column order.
    """
    # Series.iteritems() was removed in pandas 2.0; items() is the supported
    # spelling and yields the same (label, dtype) pairs.
    return [
        col
        for col, dtype in df.dtypes.items()
        if pandas.api.types.is_numeric_dtype(dtype)
    ]

rectified_rec_df(array: np.ndarray) -> pd.DataFrame

Attempt to 'flatten' a 1- or 2D ndarray, possibly with a structured dtype but with no nested arrays, into a DataFrame, typecasting as necessary for pandas compatibility.

Source code in pdr/pd_utils.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
def rectified_rec_df(array: np.ndarray) -> pd.DataFrame:
    """
    Turn a 1- or 2D ndarray (optionally with a structured dtype, but without
    nested arrays) into a DataFrame, typecasting as needed for pandas
    compatibility.

    3D input is accepted by merging its last two axes. Anything with more
    than three axes raises NotImplementedError.
    """
    n_axes = len(array.shape)
    if n_axes > 3:
        raise NotImplementedError("dtypes with >2D elements are not supported")
    if n_axes == 3:
        # it's possible to pack 2D arrays into individual records, which
        # pandas cannot represent, so flatten each record's 2D element.
        # this can be generalized if >2D elements ever turn up.
        n_records, n_rows, n_cols = array.shape
        array = array.reshape(n_records, n_rows * n_cols)
    # from_records is slow and acts weird on arrays without a structured
    # dtype, so only use it when the dtype actually has fields
    is_unstructured = len(array.dtype) == 0
    build = pd.DataFrame if is_unstructured else pd.DataFrame.from_records
    return build(enforce_order_and_object(array))

reindex_df_values(df: pd.DataFrame, column='NAME') -> pd.DataFrame

Give unique string identifiers to every value in a particular column of a DataFrame by appending an underscore and an incrementing number if necessary.

Include START_BYTE in the string for values marked as RESERVED.

Source code in pdr/pd_utils.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def reindex_df_values(df: pd.DataFrame, column="NAME") -> pd.DataFrame:
    """
    Make the values in one column of a DataFrame unique.

    Any value that occurs more than once has an underscore and an
    incrementing number appended to each occurrence. Repeated values of
    "RESERVED" additionally get the START_BYTE of their first occurrence
    inserted before the number. `df` is modified in place and returned.
    """
    for label, members in df.groupby(column):
        n_members = len(members)
        if n_members > 1:
            # TODO: check what this is hitting.
            if label == "RESERVED":
                stem = f"RESERVED_{members['START_BYTE'].iloc[0]}"
            else:
                stem = label
            df.loc[members.index, column] = [
                f"{stem}_{ix}" for ix in range(n_members)
            ]
    return df

structured_array_to_df(array: np.ndarray) -> pd.DataFrame

Attempt to convert an ndarray with a structured dtype to a DataFrame, flattening any nested 1- or 2-D arrays into blocks of columns and typecasting as necessary for pandas compatibility. This does not attempt to flatten nested elements with dimensionality > 2, and will raise a NotImplementedError if it encounters them.

Source code in pdr/pd_utils.py
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
def structured_array_to_df(array: np.ndarray) -> pd.DataFrame:
    """
    Convert an ndarray with a structured dtype to a DataFrame.

    Consecutive fields without a shape are converted together. Each field
    that carries a shape (a nested 1- or 2-D array) is converted on its own,
    and its columns are named "<field>_<n>". The pieces are then joined
    column-wise in field order. Nested elements with dimensionality > 2 are
    not flattened and raise NotImplementedError.
    """
    frames = []
    pending = []
    for descr in array.dtype.descr:
        field_name = descr[0]
        # a two-element descr entry is (name, format): no subarray shape
        if len(descr) == 2:
            pending.append(field_name)
            continue
        # flush the run of simple fields that preceded this nested one
        if pending:
            frames.append(rectified_rec_df(array[pending]))
            pending = []
        nested = rectified_rec_df(array[field_name])
        nested.columns = [
            f"{field_name}_{ix}" for ix in range(len(nested.columns))
        ]
        frames.append(nested)
    if pending:
        frames.append(rectified_rec_df(array[pending]))
    return frames[0] if len(frames) == 1 else pd.concat(frames, axis=1)

pdr

Data

Core pdr class.

Source code in pdr/pdr.py
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
class Data:
    """Core `pdr` class."""
    def __init__(
        self,
        fn: Union[Path, str],
        *,
        debug: bool = False,
        label_fn: Optional[Union[Path, str]] = None,
        search_paths: Union[Collection[str], str] = (),
        skip_existence_check: bool = False,
        pvl_limit: int = DEFAULT_PVL_LIMIT,
        tracker: Optional[TrivialTracker] = None,
        strict_label_decode: bool = True
    ):
        """
        Set up bookkeeping for a product, locate its label, determine which
        standard it follows, read its metadata, and index its data objects.

        Args:
            fn: path to a file associated with the product.
            debug: if True, raise an exception rather than a warning when
                loading a data object fails. Also enables a `Tracker` when
                no `tracker` is passed.
            label_fn: passed to `associate_label_file()` along with the
                resolved filename.
            search_paths: additional search path(s), placed after the
                automatically-determined one.
            skip_existence_check: passed through to `check_cases()` and
                `associate_label_file()`.
            pvl_limit: passed to `read_metadata()`.
            tracker: tracker object to use in place of the default.
            strict_label_decode: passed to `read_metadata()` as
                `strict_decode`.

        Raises:
            ValueError: if `fn` is a string beginning with an http(s) or ftp
                scheme, or if reading the metadata raises UnicodeError or
                FileNotFoundError.
        """
        # Bail out early if someone's trying to load directly from the network.
        if isinstance(fn, str) and re.match("(?i)^(?:https?|ftp):", fn):
            raise ValueError("Read-from-url is not currently implemented.")

        # list of the product's associated data objects
        self.index = []
        # do we raise an exception rather than a warning if loading a data
        # object fails?
        self.debug = debug
        self.filename = check_cases(Path(fn).absolute(), skip_existence_check)
        self.loaders = {}
        # pick a tracker: a real one in debug mode, a trivial one otherwise,
        # unless the caller supplied their own
        if (self.debug is True) and (tracker is None):
            self.tracker = Tracker(
                Path(self.filename).name.replace(".", "_"),
                outdir=Path(__file__).parent / ".tracker_logs",
            )
            self.tracker.clear()
        elif tracker is None:
            self.tracker = TrivialTracker()
        else:
            self.tracker = tracker
        # mappings from data objects to local paths
        self.file_mapping = {}
        # known special constants per data object
        self.specials = {}
        # dict to flag images loaded prescaled (currently only from FITS files)
        self._scaleflags = {}
        # where can we look for files containing data objects?
        # not yet fully implemented; only uses first (automatic) one.
        self.search_paths = [self._init_search_path()] + listify(search_paths)
        self.standard = None
        # cache for pds4_tools.reader.general_objects.Structure objects.
        self._pds4_structures = None
        # cache for hdulist, for primary FITS files -- this is primarily
        # an optimization for compressed files
        self._hdulist = None
        # dict of [str, int] for HDU name / position in primary FITS files,
        # for reindexing duplicates and efficiency
        self._hdumap = None
        # data structure recording interleaved PDS3 objects not defined at top
        # level, intended for things like axplanes and line prefix tables
        self._interleaved_objects = {}
        # Attempt to identify and assign a label file
        self.labelname = associate_label_file(
            self.filename, label_fn, skip_existence_check
        )
        # if unlabeled, check to see if we can read it in a non-PDS format
        if self.labelname is None:
            primary_format = check_primary_fmt(self.filename)
        elif (fmt := check_primary_fmt(self.labelname)) is not None:
            primary_format = fmt
        else:
            primary_format = None
        # decide on the standard: a recognized primary format wins, then
        # PDS4 (by label file extension), and PDS3 is the fallback
        if primary_format is not None:
            self.standard = primary_format
            if self.standard == "FITS":
                from astropy.io import fits

                # TODO: bad. need to not leave this open, although inefficient
                self._hdulist = fits.open(self.filename)
        elif (
            str(self.labelname).endswith(".xml") 
            or str(self.labelname).endswith(".lblx")
            or ("CE" in str(self.labelname) and
                str(self.labelname).endswith((".2BL", ".2AL", ".2CL", ".01L")))
        ):
            self.standard = "PDS4"
            self._pds4_structures = {}
            self._init_pds4()
        else:
            self.standard = "PDS3"
        try:
            self.metadata = self.read_metadata(
                pvl_limit=pvl_limit, strict_decode=strict_label_decode
            )
        except (UnicodeError, FileNotFoundError) as ex:
            raise ValueError(
                f"Can't load this product's metadata: {ex}, {type(ex)}"
            )
        self.load_metadata_changes()
        # PDS4 products were already indexed by _init_pds4()
        if self.standard == "PDS4":
            return
        if primary_format is not None:
            self._init_primary_format()
            return
        # everything below applies to PDS3 products only
        self.identifiers = self.metadata.identifiers
        self.pointers = get_pds3_pointers(self.metadata)
        # if self.pointers is None, we've probably got a weird edge case where
        # someone directly opened a PVL file that's not an individual product
        # label (e.g. a format file or a non-PDS PVL file) -- but there's no
        # reason to not allow them to use PDR as a PVL parser.
        if self.pointers is not None:
            self._find_objects()

    # noinspection PyProtectedMember
    def load_metadata_changes(self):
        """
        (Re)bind this object's metadata accessors to those of
        `self.metadata`. If they have been bound before, the metadata cache
        is refreshed first.
        """
        previously_bound = "_metaget_interior" in dir(self)
        if previously_bound:
            self.metadata.refresh_cache()
        metadata = self.metadata
        self._metaget_interior = metadata._metaget_interior
        self._metablock_interior = metadata._metablock_interior

    def _init_pds4(self):
        """
        Open a PDS4 product with pds4_tools, but in our interface idiom.

        Each structure is cached and indexed under its id, with spaces
        replaced by underscores. The label is cached and indexed as "label".
        """
        import pdr.pds4_tools as pds4

        product = pds4.read(
            self.labelname, lazy_load=True, quiet=True, no_scale=True
        )
        for structure in product.structures:
            key = structure.id.replace(" ", "_")
            self._pds4_structures[key] = structure
            self.index.append(key)
        self._pds4_structures["label"] = product.label
        self.index.append("label")

    def _init_search_path(self) -> str:
        """
        Set initial path this object will check for additional files (just the
        directory that contains its "primary" file).

        The label file is preferred; the data file is the fallback.

        Returns:
            The absolute path of the containing directory, as a string.

        Raises:
            FileNotFoundError: if neither `labelname` nor `filename` is set
                to a non-None value.
        """
        for target in ("labelname", "filename"):
            # look the attribute up with the builtin getattr, and skip it if
            # it is unset or None (e.g. no label file has been associated)
            path = getattr(self, target, None)
            if path is not None:
                return str(Path(path).absolute().parent)
        raise FileNotFoundError

    def _associate_prefix_tables(self, imname, preobjs):
        """
        Check for underspecified line prefix table objects associated with a
        PDS3 image specification.

        Args:
            imname: name of the image object whose metadata block is
                examined.
            preobjs: names of candidate prefix objects. A candidate is
                associated with `imname` when `standalone_start_byte()`
                gives the same value for both.

        Any association found is recorded in `self._interleaved_objects`.
        If no candidate matches, a new object named
        "<imname>_LINE_PREFIX_TABLE" is recorded there and also appended to
        `self.index`.

        Raises:
            NotImplementedError: if more than one key of the image's block
                matches STRUCTUREPAT.
        """
        block = self.metablock_(imname)
        # TODO, maybe: do a special case check against the name. This is only
        #  important if we ever have a special object name case that is
        #  relevant. (currently we do not.)
        if block is None:
            return
        # keys of the block that match STRUCTUREPAT; with none, there is
        # nothing to associate
        if len(spointers := tuple(filter(STRUCTUREPAT.match, block))) == 0:
            return
        elif len(spointers) > 1:
            # hopefully this never happens. we will handle it if so
            raise NotImplementedError(
                "Multiple implicitly-defined line prefix tables within a "
                "single object are not supported."
            )
        # check for a matching prefix table defined at top level with no
        # explicit block
        if len(preobjs) > 0:
            from pdr.loaders.queries import standalone_start_byte

            im_fn_byte = standalone_start_byte(self, imname)
            found = False
            # every candidate with the same start byte is recorded, not
            # just the first
            for pre in preobjs:
                if standalone_start_byte(self, pre) == im_fn_byte:
                    self._interleaved_objects[pre] = {
                        'parent': imname, 'type': 'line_prefix_table'
                    }
                    found = True
            if found is True:
                return
        # TODO: unclear if this should be a special case or a general fix. Need 
        # more examples to know for sure; Galileo SSI-4-REDR IMAGE objects are 
        # the current known example.
        # For cases when ^LINE_PREFIX_STRUCTURE = "" 
        if (
            "^LINE_PREFIX_STRUCTURE" in block 
            and block["^LINE_PREFIX_STRUCTURE"] == ""
        ):
            return
        # TODO, maybe: technically this could be a line suffix table,
        #  although we have never found them. We could add a check.
        # no top-level candidate matched, so register an implicit prefix
        # table under a synthesized name
        fixname = f'{imname}_LINE_PREFIX_TABLE'
        self._interleaved_objects[fixname] = {
            'parent': imname, 'type': 'line_prefix_table'
        }
        self.index.append(fixname)

    def _find_objects(self):
        """
        Populate this object's index with every top-level data object named
        in the label, skipping 'trivial' ones (see
        `loaders.utility.is_trivial()`). Then look for interleaved objects
        that are not defined at top level, such as some line prefix tables.

        If `check_special_objects()` reports a special case, its objects are
        indexed instead and nothing else is done.

        TODO: check for ISIS-style axplane objects.
        """
        from pdr.loaders.utility import is_trivial
        from pdr.formats.checkers import check_special_objects

        is_special, special_objects = check_special_objects(self.identifiers)
        if is_special is True:
            self.index.extend(special_objects)
            return

        # TODO: make this not add objects again if called multiple times
        for object_name in map(depointerize, self.pointers):
            if not is_trivial(object_name):
                self.index.append(object_name)
        # check for poorly / implicitly-defined line prefix tables:
        # top-level line prefix objects with no blocks of their own
        prefix_candidates = [
            name for name in self.index
            if "LINE_PREFIX" in name and self.metablock_(name) is None
        ]
        # snapshot the image names before iterating, because
        # _associate_prefix_tables() may append to self.index
        image_names = tuple(
            name for name in self.index
            if "IMAGE" in name and "TABLE" not in name
        )
        for image_name in image_names:
            self._associate_prefix_tables(image_name, prefix_candidates)

    def _object_to_filename(
        self, object_name: str
    ) -> Union[str, list[str], Optional[tuple[Path, ...]]]:
        """
        Work out where on disk the file containing a named data object might
        be. Returns one or more candidate paths. Does not check that
        anything exists at them (typically performed by calls to
        `utils.check_cases()`).

        Special-case filenames are tried first, then compressed-file
        pointers, then the object's pointer in the label. If the pointer
        does not yield a filename, this object's own filename is returned.
        """
        is_special, special_target = check_special_fn(
            self, object_name, self.identifiers
        )
        if is_special is True:
            return self.get_absolute_paths(special_target)

        is_comp, comp_paths = self._check_compressed_file_pointer(object_name)
        if is_comp is True:
            return comp_paths

        target = self.metaget_(pointerize(object_name))
        # a non-string sequence pointer may carry the filename as its
        # first element
        if (
            isinstance(target, Sequence)
            and not isinstance(target, str)
            and isinstance(target[0], str)
        ):
            target = target[0]
        if not isinstance(target, str):
            return self.filename
        return self.get_absolute_paths(target)

    def _check_compressed_file_pointer(
        self, object_name: str
    ) -> tuple[bool, Optional[tuple[Path, ...]]]:
        """
        When PDS3 labels describe data objects in compressed files, they often
        give the names that the compressed files _would_ have, were someone to
        decompress them, as the physical locations of those objects. This can
        be confusing, because you cannot load an object from a merely
        hypothetical file.

        However, this is by no means a strict convention, so we can't just
        assume that it's the case -- we have to check all the file names
        mentioned for that object in the label, including those not given as
        top-level pointers.

        Returns:
            `(True, paths)` if the label has both COMPRESSED_FILE and
            UNCOMPRESSED_FILE entries and `object_name` is a key of the
            UNCOMPRESSED_FILE block. `paths` holds the absolute candidate
            paths for each block's FILE_NAME, with the COMPRESSED_FILE
            candidates first. Otherwise `(False, None)`.
        """
        # a tuple, not a set: iterating a set of strings would make the order
        # of the returned candidate paths vary with string hashing
        compkeys = ("COMPRESSED_FILE", "UNCOMPRESSED_FILE")
        if (
            len(set(compkeys).intersection(self.metadata.keys())) == 2
            and object_name in self.metablock_("UNCOMPRESSED_FILE").keys()
        ):
            blocks = filter(None, [self.metaget_(k) for k in compkeys])
            filenames = filter(None, [b.get("FILE_NAME") for b in blocks])
            return True, tuple(
                chain.from_iterable(map(self.get_absolute_paths, filenames))
            )
        return False, None

    def _target_path(
        self,
        object_name: str,
        cached: bool = True,
        raise_missing: bool = False
    ) -> Optional[Union[Path, list[Path], str]]:
        """
        Considering all known search paths and treating filenames as
        case-insensitive, attempt to find a filesystem path to a
        file or files in which a particular named data object might exist.
        This autopopulates self.file_mapping[object_name] if it finds one or
        more files, and by default treats this value as cached on subsequent
        calls (which can improve performance significantly, especially on
        networked filesystems).

        Args:
            object_name: name of the data object, or a set of such names.
            cached: if True, return a previously-found path when one is
                stored in `self.file_mapping`.
            raise_missing: if True, let FileNotFoundError propagate instead
                of returning None.

        Returns:
            The path found, or None if there is none and `raise_missing` is
            False. For a set of names, a list with one such result per
            member.
        """
        if isinstance(object_name, set):
            # sets are unhashable, so they can be neither looked up in nor
            # stored as keys of file_mapping; resolve (and cache) each member
            # on its own instead
            return [
                self._target_path(
                    obj, cached=cached, raise_missing=raise_missing
                )
                for obj in object_name
            ]
        if cached is True and (self.file_mapping.get(object_name) is not None):
            return self.file_mapping[object_name]
        try:
            path = check_cases(self._object_to_filename(object_name))
        except FileNotFoundError:
            if raise_missing is True:
                raise
            return None
        self.file_mapping[object_name] = path
        return path

    def unloaded(self) -> tuple[str]:
        """
        Return the names of all indexed data objects that are not yet
        attributes of this object, i.e. identified but not loaded.
        """
        return tuple(name for name in self.index if name not in dir(self))

    def load(self, name: str, reload: bool = False, **load_kwargs: Any):
        """
        Explicitly load an identified data object by name; alternatively
        `name="all"` means "load every identified object". Does not return the
        object; just assigns it to the `name` attribute of `self`. The
        `Data.__getitem__()` interface lazy-loads by calling this function
        with default arguments in response to `data['NOTYETLOADED']` etc.

        Args:
            name: name of an object in `self.index`, or "all".
            reload: if True, load the object even if it is already an
                attribute of this object.
            **load_kwargs: passed to `load_from_pointer()`.

        Raises:
            KeyError: if `name` is neither "all" nor in `self.index`.
            AlreadyLoadedError: if `name` is already loaded and `reload` is
                False.

        If the loader fails with anything other than KeyboardInterrupt, a
        warning is issued (except for DebugExceptionPreempted) and the
        object's metadata is assigned to the attribute in place of its data.
        """
        # prelude: don't try to load nonexistent keys; facilitate
        # load-everything behavior; don't reload by default
        if (name != "all") and (name not in self.index):
            raise KeyError(f"{name} not found in index: {self.index}.")
        if name == "all":
            return self.load_all()
        if (name in dir(self)) and (reload is False):
            raise AlreadyLoadedError(
                f"{name} is already loaded; pass reload=True to "
                f"force reload."
            )
        # non-PDS3 standards each have their own loading path
        if self.standard == "PDS4":
            return self._load_pds4(name)
        if self.standard == "FITS":
            self._add_loaded_objects(self._load_primary_fits(name))
            return
        if self.standard in DESKTOP_IMAGE_STANDARDS:
            from pdr.loaders.handlers import handle_compressed_image

            if self.metaget("n_frames", 1) == 1:
                self._add_loaded_objects(
                    {name: handle_compressed_image(self.filename)}
                )
                return
            # TODO: hacky!
            if self.standard == 'MPO' and name == 'IMAGE':
                seek = 0
            else:
                # multi-frame object names end in "_<frame number>"
                seek = int(name.split("_")[-1])
            self._add_loaded_objects(
                {name: handle_compressed_image(self.filename, seek)}
            )
            return
        # PDS3: make sure we know which file holds the object
        if self.file_mapping.get(name) is None:
            target = self._target_path(name)
            if target is None:
                return self._file_not_found(name)
            self.file_mapping[name] = target
        try:
            obj = self.load_from_pointer(name, **load_kwargs)
            if obj is None:
                return
            if not isinstance(obj, dict):
                raise TypeError(
                    f"loader returned non-dict object of type ({type(obj)})"
                )
            self._add_loaded_objects(obj)
            return
        except DebugExceptionPreempted:
            pass
        except KeyboardInterrupt:
            raise
        except NotImplementedError as ex:
            warnings.warn(f"This product's {name} is not yet supported: {ex}.")
        except FileNotFoundError:
            warnings.warn(f"Unable to find files required by {name}.")
        except Exception as ex:
            warnings.warn(f"Unable to load {name}: {ex}")
        # loading failed: fall back to exposing the object's metadata
        setattr(self, name, self.metaget_(name))

    def _add_loaded_objects(self, obj: Mapping[str, Any]):
        """
        Helper for `load()`. Ingests objects returned by a `Loader`: every
        non-None value becomes an attribute of this object under its key,
        and the key is added to the index if it is not already there.
        """
        for name, value in obj.items():
            if value is None:
                continue
            setattr(self, name, value)
            if name not in self.index:
                self.index.append(name)

    def load_all(self):
        """
        Handler (and alias) for `Data.load("all")`. Loads every key of this
        object except those matching OBJECTS_IGNORED_BY_DEFAULT, skipping
        anything that is already loaded.
        """
        from pdr.loaders.dispatch import OBJECTS_IGNORED_BY_DEFAULT

        for name in self.keys():
            if not OBJECTS_IGNORED_BY_DEFAULT.match(name):
                try:
                    self.load(name)
                except AlreadyLoadedError:
                    # already present; nothing to do for this one
                    pass

    def _file_not_found(self, object_name: str):
        """
        Default behavior when the file for `object_name` cannot be located.

        In debug mode, raise FileNotFoundError. Otherwise emit a warning and
        set the attribute named `object_name` to whatever
        `self.metaget_(object_name)` returns.
        """
        missing = self._object_to_filename(object_name)
        message = f"{object_name} file {missing} not found in path."
        if self.debug:
            raise FileNotFoundError(message)
        warnings.warn(message)
        setattr(self, object_name, self.metaget_(object_name))

    def _load_primary_fits(
        self, object_name: str
    ) -> Union[np.ndarray, pd.DataFrame, None]:
        """
        Load the HDU named `object_name` from a FITS file opened in
        "primary" FITS mode, addressing the HDU by its entry in
        `self._hdumap` and reusing the cached `self._hdulist`. If the result
        is an ndarray, it is flagged in `self._scaleflags`.
        """
        from pdr.loaders.handlers import handle_fits_file

        loaded = handle_fits_file(
            self.filename,
            object_name,
            self._hdumap[object_name],
            self._hdulist,
            hdu_id_is_index=True,
        )
        # class-name comparison avoids needing numpy just for a type check
        is_array = loaded.__class__.__name__ == "ndarray"
        if is_array:
            self._scaleflags[object_name] = True
        return loaded

    def _init_primary_format(self):
        """
        Initialization handler for "primary" format modes, i.e. cases in
        which `Data` offers an interface to a file or files in a standard
        format. For FITS, every metadata key is added to the index; for
        'desktop' image formats, indexing is delegated to
        `_add_compressed_image_objects()`. Any other standard raises
        NotImplementedError.
        """
        if self.standard == "FITS":
            self.index.extend(self.metadata.keys())
            return
        if self.standard in DESKTOP_IMAGE_STANDARDS:
            return self._add_compressed_image_objects()
        raise NotImplementedError(f"unrecognized standard {self.standard}")

    def _add_compressed_image_objects(self):
        """
        Add index entries for the image(s) in a 'desktop' format file.

        Single-frame files get one "IMAGE" entry. Multiframe GIF and WEBP
        files get "FRAME_0" ... "FRAME_{n-1}". MPO files get "IMAGE" for the
        primary image, then one entry per remaining MPEntry named from its
        MPType (parentheses and spaces replaced by underscores) plus its
        position. Other multiframe standards, and MPO files whose first
        entry is not the baseline primary image, raise NotImplementedError.
        """
        nframes = self.metaget("n_frames", 1)
        if nframes == 1:
            self.index.append("IMAGE")
            return
        if self.standard in ('GIF', 'WEBP'):
            self.index += [f"FRAME_{i}" for i in range(nframes)]
            return
        if self.standard != 'MPO':
            raise NotImplementedError(
                f"multiframe {self.standard} images are not yet supported."
            )
        entries = [d['Attribute'] for d in self.metaget('MPEntry')]
        if entries[0]['MPType'] != 'Baseline MP Primary Image':
            raise NotImplementedError("Non-primary first MPO image")
        names = ['IMAGE']
        for position, entry in enumerate(entries[1:], start=1):
            tname = re.sub(r'[() ]', '_', entry['MPType'])
            names.append(f"{tname}_{position}")
        self.index += names

    # TODO, maybe: this can result in different keys of self referring to
    #  duplicate header objects, one like "object_name_HEADER" and one like
    #  "HEADER_0", etc. This can also happen for PDS3, so maybe this is just
    #  an acceptable interface idiom.
    def _find_fits_header_pds4_id(self, start_byte: int) -> Optional[str]:
        """
        Look for a PDS4 header object that ends exactly where an HDU's data
        segment begins.

        `start_byte` is the start byte of the HDU's data segment. The first
        `HeaderStructure` in `self._pds4_structures` whose offset plus
        object_length equals `start_byte` decides the result: its 'name'
        with spaces replaced by underscores, or None if it has no 'name'.
        Returns None when no structure matches.
        """
        from pdr.pds4_tools.reader.header_objects import HeaderStructure

        for structure in self._pds4_structures.values():
            # we only require header objects to have the 'object_length'
            # key. label objects also do not have meta_data
            if not isinstance(structure, HeaderStructure):
                continue
            meta = structure.meta_data
            if 'object_length' not in meta.keys():
                continue
            if meta['offset'] + meta['object_length'] != start_byte:
                continue
            if 'name' in meta.keys():
                return meta['name'].replace(' ', '_')
            return None

        return None

    def _load_pds4(self, object_name: str):
        """
        Load one object of a PDS4 product from its cached pds4_tools
        structure and attach the result to self.

        Dispatch order:

        * a `Label` structure is stored as `self.label`;
        * a non-None result from `check_special_pds4_cases` is stored under
          `object_name`;
        * an object whose parent file is in FITS format preempts pds4_tools
          and goes through our internal FITS-loading workflow
          (`handle_fits_file`);
        * arrays are stored as ndarrays;
        * tables are stored as DataFrames, with `GROUP_n` tokens removed
          from column names and leading/trailing commas and spaces stripped;
        * anything else is stored as the structure's `data` unchanged.
        """
        structure = self._pds4_structures[object_name]
        from pdr.pds4_tools.reader.label_objects import Label

        if isinstance(structure, Label):
            setattr(self, "label", structure)
            return
        # evaluate the special-case check once and reuse its result rather
        # than calling it a second time to fetch the value
        special = check_special_pds4_cases(
            structure, self.filename, object_name
        )
        if special is not None:
            setattr(self, object_name, special)
            return
        if check_primary_fmt(structure.parent_filename) == "FITS":
            from pdr.loaders.handlers import handle_fits_file

            offset = structure.meta_data['offset']
            result = handle_fits_file(
                structure.parent_filename, object_name, offset
            )
            if structure.is_header() is True:
                return self._add_loaded_objects(result)
            # if the label defines this HDU's header as its own object,
            # expose the header under that object's local identifier
            if (hid := self._find_fits_header_pds4_id(offset)) is not None:
                result[hid] = result.pop(f"{object_name}_HEADER")
            if structure.is_array() is True:
                self._scaleflags[object_name] = True
            self._add_loaded_objects(result)
        elif structure.is_array():
            import numpy as np

            setattr(self, object_name, np.asarray(structure.data))
        elif structure.is_table():
            from pdr.pd_utils import structured_array_to_df

            df = structured_array_to_df(structure.data)
            df.columns = df.columns.str.replace(r"GROUP_?\d+", "", regex=True)
            df.columns = df.columns.str.strip(", ")
            setattr(self, object_name, df)
        # TODO: do other important cases exist?
        else:
            setattr(self, object_name, structure.data)

    def read_metadata(
        self, pvl_limit: int = DEFAULT_PVL_LIMIT, strict_decode: bool = True
    ) -> Metadata:
        """
        Ingest this product's metadata and wrap it in a `Metadata` object.

        * "FITS" standard: unpack the FITS headers (also populating
          `self._hdumap`).
        * "PDS4" standard: reformat the label that pds4_tools already
          ingested (`self.label`).
        * 'desktop' image standards: dig parameters out of the image file.
        * otherwise (PDS3/PVL): parse the detached label if one was found,
          or else look for a label attached to the data file itself, using
          `read_pvl` with `pvl_limit` and `strict_decode`. On success,
          record the label path in `self.labelname` and
          `self.file_mapping["LABEL"]` and add "LABEL" to the index.
        """
        standard = self.standard
        if standard == "FITS":
            from pdr.loaders.handlers import unpack_fits_headers

            unpacked = unpack_fits_headers(
                self.filename, hdulist=self._hdulist
            )
            mapping, params, self._hdumap = unpacked
            return Metadata((mapping, params), standard="FITS")
        if standard == "PDS4":
            reformatted = reformat_pds4_tools_label(self.label)
            return Metadata(reformatted, standard="PDS4")
        if standard in DESKTOP_IMAGE_STANDARDS:
            from pdr.pil_utils import skim_image_data, paramdig

            return Metadata(paramdig(skim_image_data(self.filename)))
        # self.labelname is None means we didn't find a detached label, so
        # fall back to looking for a label attached to the file itself
        if self.labelname is None:
            label_path = self.filename
        else:
            label_path = self.labelname
        metadata = Metadata(
            read_pvl(
                label_path,
                max_size=pvl_limit,
                default_strict_decode=strict_decode,
            )
        )
        # these assignments deliberately come after the read step, to
        # facilitate debugging in cases where there is not in fact an
        # attached label or we couldn't read it
        self.labelname = label_path
        self.file_mapping["LABEL"] = label_path
        self.index.append("LABEL")
        return metadata

    def load_from_pointer(
        self, pointer: str, **load_kwargs: Any
    ) -> dict[
        str, Union[pd.DataFrame, np.ndarray, str, MultiDict, "PVLModule"]
    ]:
        """
        PDS3 data object-loading handler. Set up the appropriate `Loader` for
        the object, set up load flow tracking, call the loader, and perform
        basic cleanup.

        `pointer` is the name of the object to load; `load_kwargs` are
        passed through to the loader. Returns whatever the loader returned.

        In debug mode the loader is wrapped in an optional `Dynamic`; if the
        wrapper recorded any errors, a warning carrying the most recent one
        is emitted and `DebugExceptionPreempted` is raised.
        """
        from pdr.loaders.dispatch import pointer_to_loader

        loader = pointer_to_loader(pointer, self)
        if self.debug is True:
            loader = Dynamic.from_function(loader, optional=True)
        self.loaders[pointer] = loader
        self.tracker.set_metadata(
            filename=self.file_mapping[pointer], obj=pointer
        )
        obj = loader(self, pointer, tracker=self.tracker, **load_kwargs)
        # Report recorded loader errors before inspecting the result, so the
        # real failure is surfaced rather than a secondary error raised by
        # subscripting a missing result.
        # NOTE(review): presumably the optional Dynamic wrapper returns None
        # after capturing an exception -- confirm against dustgoggles.
        if self.debug is True and len(loader.errors) > 0:
            warnings.warn(
                f"Unable to load {pointer}: {loader.errors[-1]['exception']}"
            )
            raise DebugExceptionPreempted
        if obj is None:
            # nothing was loaded, so there is nothing to flag as prescaled
            return obj
        # FITS arrays are scaled by default, and most 'desktop' images never
        # require scaling. we currently treat GeoTIFFs and JP2 as the only
        # exceptions.
        # TODO: assess whether there are non-GeoTIFF TIFFs floating around in
        #  the PDS that might still require scaling.
        unwrap = loader.func.__self__ if self.debug is True else loader
        loader_name = unwrap.__class__.__name__
        if (
            (loader_name == "ReadFits")
            and (obj[pointer].__class__.__name__ == "ndarray")
        ):
            self._scaleflags[pointer] = True
        if (
            (loader_name == 'ReadCompressedImage')
            and (obj[pointer].__class__.__name__ == "ndarray")
        ):
            from pdr.loaders.handlers import _check_prescaled_desktop

            self._scaleflags[pointer] = _check_prescaled_desktop(
                self.file_mapping[pointer]
            )
        return obj

    def get_scaled(
        self,
        object_name: str,
        inplace: bool = False,
        float_dtype: Optional[np.dtype] = None
    ) -> np.ndarray:
        """
        Fetch the data object named `object_name`, mask its special
        constants, and apply any scale and offset specified in the label.
        Only relevant to arrays; anything else raises TypeError.

        Objects flagged as already scaled at load time are returned as-is,
        as are arrays from products that are neither PDS3 nor PDS4. PDS4
        arrays are scaled via `scale_pds4_tools_struct`, which ignores
        `inplace` and `float_dtype`.

        If `inplace` is True, PDS3 calculations are done in-place on the
        original array, with attendant memory savings and destructiveness.
        """
        array = self[object_name]
        # avoid numpy import just for type check
        if array.__class__.__name__ != "ndarray":
            raise TypeError("get_scaled is only applicable to arrays.")
        if self._scaleflags.get(object_name) is True:
            return array
        if self.standard == "PDS4":
            from pdr._scaling import scale_pds4_tools_struct

            # Do whatever pds4_tools would most likely do with these data.
            # TODO: shake this out much more vigorously. perhaps make
            #  the inplace and float_dtype arguments do something.
            #  check and see if implicit special constants ever still exist
            #  stealthily in PDS4 data. etc.
            structure = self._pds4_structures[object_name]
            return scale_pds4_tools_struct(structure)

        # TODO: double-check that astropy is successfully handling masking
        # TODO: most 'desktop' image formats should never contain special
        #  constants, but some (e.g. JP2) may be able to? check.
        if self.standard != "PDS3":
            return array

        from pdr._scaling import mask_specials, scale_array

        # special constants are looked up once per object and then cached
        if object_name not in self.specials:
            self.specials[object_name] = self.find_special_constants(
                object_name
            )
        constants = self.specials[object_name]
        if constants != {}:
            array = mask_specials(array, list(constants.values()))
        return scale_array(self, array, object_name, inplace, float_dtype)

    def find_special_constants(self, object_name: str) -> dict[str, Number]:
        """
        Look up or infer special constants for one of our data objects.
        Constants implied by the product's identifiers take precedence; if
        there are none, fall back to `pdr._scaling.find_special_constants`
        for the named object. In general, only works well on ndarrays.
        """
        consts = special_image_constants(self.identifiers)
        if len(consts) > 0:
            return consts

        from pdr._scaling import find_special_constants

        return find_special_constants(self, self[object_name], object_name)

    def metaget(
        self, text: str, default: Any = None, warn: bool = True
    ) -> Any:
        """
        Get the first value from this object's metadata whose key exactly
        matches `text`, even if it is nested inside a mapping, evaluated
        using `self.metadata.formatter`. Delegates to
        `self.metadata.metaget`, passing `default` and `warn` through.

        Warning:
            this function's return values are memoized for performance.
            updating elements of self.metadata that have already been
            accessed with this function will not update future calls to
            this function.
        """
        lookup = self.metadata.metaget
        return lookup(text, default, warn)

    def metaget_(self, text: str, default: Any = None) -> Any:
        """Quiet-by-default version of metaget: always passes warn=False."""
        lookup = self.metadata.metaget
        return lookup(text, default, False)

    def metablock(self, text: str, warn: bool = True) -> Optional[Mapping]:
        """
        Get the first value from this object's metadata whose key exactly
        matches `text`, even if it is nested inside a mapping, if the value
        itself is a mapping (e.g., nested PVL block, XML 'area', etc.),
        evaluated using self.metadata.formatter. If there is no key matching
        `text`, will evaluate and return the metadata as a whole. Delegates
        to `self.metadata.metablock`.

        Warning:
            this function's return values are memoized for performance.
            updating elements of self.metadata that have already been
            accessed with this function will not update future calls to
            this function.
        """
        lookup = self.metadata.metablock
        return lookup(text, warn)

    def metablock_(self, text: str) -> Optional[Mapping]:
        """Quiet-by-default version of metablock: always passes warn=False."""
        lookup = self.metadata.metablock
        return lookup(text, False)

    def get_absolute_paths(self, filename: Union[str, Path]) -> list[str]:
        """
        Construct absolute `Path`s for a filename (or filenames) in each of
        our search paths. (These are places we can look for that file).
        """
        def absolutize(parts):
            # parts is one (search path, filename) pairing
            return Path(*parts).absolute()

        pairings = product(self.search_paths, listify(filename))
        return gmap(absolutize, pairings, evaluator=list)

    # TODO: reorganize this -- common dispatch funnel with dump_browse,
    #  split up the image-gen part of _browsify_array, something like that
    def show(
        self,
        object_name: str = None,
        scaled: bool = True,
        **browse_kwargs: Any
    ) -> Image:
        """
        Produce an Image from an array data object associated with this
        product. A convenient way to quickly look at data.

        `object_name` must be given (ValueError otherwise) and must refer to
        an ndarray (TypeError otherwise). If `scaled` is True, the array is
        passed through `get_scaled()` first. `browse_kwargs` are forwarded
        to the image-generation routine.
        """
        if object_name is None:
            raise ValueError(
                f"please specify the name of an image object. "
                f"keys include {self.index}"
            )
        if self[object_name].__class__.__name__ != "ndarray":
            raise TypeError("Data.show only works on array data.")
        obj = (
            self.get_scaled(object_name)
            if scaled is True
            else self[object_name]
        )
        # no need to have all this mpl stuff in the namespace normally
        from pdr.browsify import _browsify_array

        return _browsify_array(obj, save=False, outbase="", **browse_kwargs)

    def dump_browse(
        self,
        prefix: Optional[Union[str, Path]] = None,
        outpath: Optional[Union[str, Path]] = None,
        scaled: bool = True,
        purge: bool = False,
        **browse_kwargs: Any,
    ) -> None:
        """
        Attempt to dump all data objects associated with this Data object
        to disk.

        By default, writes files to the working directory.

        By default, assigns filenames like:
        {filename stem}_{object name}.{file extension}

        So, for instance, a browse version of a TABLE object referenced from
        "jn23a1.lbl" would be written to "jn23a1_TABLE.csv".

        If prefix is not None, filenames will begin with the value of prefix
        rather than the original filename stem.

        If outpath is not None, files will be written to the value of outpath
        rather than to the working directory.

        By default, attempts to apply scaling/offset factors and special
        constant masking before writing images. If scaled is False, does not
        do that. If scaled == "both", writes both scaled and unscaled
        versions, adding "_scaled" and "_unscaled" to their respective
        filenames before the file extension. Any other value of scaled raises
        ValueError. Note that some types of load operations (like for FITS
        files) may have already applied scaling factors, in which case
        recovering the unscaled image is not possible; for files with a FITS
        extension, requesting unscaled output produces a warning and a
        single "_scaled" product.

        If purge is True, objects are deleted as soon as they are dumped,
        rendering this Data object 'empty' afterward.

        **browse_kwargs are passed directly to browsify.browsify(), and
        offer various ways to modify image dumping behavior:

        - image_clip: Union[float, tuple[float, float], None] = None
            Applies a percentile clip to the image at
            clip = (low_percentile, 100-high_percentile).
            If clip is a single value, low_percentile=high_percentile
            in the above formula. If it's a tuple, low_percentile is
            the first value in the tuple.

            The default None value causes 'nice' clipping: it clips the image
            at (1, 1), but if this results in the clipped image containing only
            a single value, it uses the original image instead. Pass 0 if
            absolutely no clipping is desired.

        - mask_color: Optional[tuple[int, int, int]] = (0, 255, 255)
            Allows specification of RGB color for masked arrays (default cyan)

        - band_ix: Optional[int] = None
            The index of the band to be exported in a multiband image. If None,
            the middle band of the image is exported. If there are 3-4 bands in
            the image and the override_rgba argument is False, this value is
            ignored.

            When set equal to "burst", returns a separate browse product for
            each band of a multiband image, appending numbers to the filenames
            prior to the file extension.

        - save: bool = True
            If False, renders images in memory but does not save them to disk.
            Not generally useful when passed to this method except for testing.

        - override_rgba: bool = False
            Allows use of band_ix when there are 3-4 bands in the image.
            Otherwise, the image will be returned as a stacked rgb image
            (the assumed 'alpha' channel is always dropped). Setting this to
            True is useful when a 3/4 band image is not actually RGB(A) (e.g.
            XYZ spatial products).

            This argument has no effect on images that do not have 3-4 bands.

        - image_format: str = "jpg"
            Sets image extension which informs the format pillow will save the
            browse image as.

        - slice_axis: int = 0
            Allows specification of which axis to slice along for the
            dump_browse image. The default slices at axis 0 (which is usually
            the axis labelled "BAND").

        - rgb_channels: Optional[tuple[int, int, int]] = None
            Allows specification of the bands used to create an RGB image. By
            default the first three bands of a 3-4 band image are used for the
            red, green, and blue channels respectively (equivalent to manually
            specifying rgb_channels=(0,1,2)).

            If this argument is used, band_ix and override_rgba are ignored. It
            can also be used on multiband images with >4 bands to output an RGB
            image.

        """
        if prefix is None:
            prefix = Path(self.filename).stem
        if outpath is None:
            outpath = Path(".")
        # Only index entries that currently appear in dir(self) are dumped.
        # The filter is lazy, so dir(self) is re-evaluated for each entry as
        # the loop advances (including after any purge deletions below).
        for obj in filter(lambda i: i in dir(self), self.index):
            outfile = str(Path(outpath, f"{prefix}_{obj}"))
            # no need to have all this mpl stuff in the namespace normally
            from pdr.browsify import browsify

            dump_it = partial(browsify, purge=purge, **browse_kwargs)
            # float_dtype is read here for get_scaled(), but it is not
            # removed from browse_kwargs, so browsify receives it as well
            fdt = browse_kwargs.get("float_dtype")
            # multidimensional arrays are the only objects for which the
            # `scaled` argument matters; everything else is dumped as-is
            if (
                self[obj].__class__.__name__ == "ndarray"
                and len(self[obj].shape) != 1
            ):
                if looks_like_this_kind_of_file(self.filename, FITS_EXTENSIONS) \
                        and (scaled == "both" or scaled is False):
                    warnings.warn("Scaling for FITS files cannot be turned "
                                  "off, dumping scaled products.")
                    dump_it(self[obj], outfile + "_scaled")
                elif scaled == "both":
                    # purge=False here overrides the value preset in the
                    # partial -- presumably so the object survives to be
                    # dumped a second time, unscaled, just below
                    dump_it(
                        self.get_scaled(obj, float_dtype=fdt),
                        outfile + "_scaled",
                        purge=False,
                    )
                    dump_it(self[obj], outfile + "_unscaled")
                elif scaled is True:
                    # when purging, scaling may be done in place, since the
                    # original array is about to be discarded anyway
                    dump_it(
                        self.get_scaled(obj, inplace=purge, float_dtype=fdt),
                        outfile,
                    )
                elif scaled is False:
                    dump_it(self[obj], outfile)
                else:
                    raise ValueError(f"unknown scaling argument {scaled}")
            else:
                dump_it(self[obj], outfile)
            if purge is True:
                self.__delattr__(obj)

    def __getattribute__(self, attr: str) -> Any:
        """
        Get an attribute of self; known data objects can be referred to
        using attribute notation. If ordinary lookup fails and `attr` names
        an object in `self.index`, that object is loaded and the lookup is
        retried. Otherwise the original AttributeError propagates.
        """
        plain_lookup = super().__getattribute__
        try:
            return plain_lookup(attr)
        except AttributeError:
            is_data_object = attr in self.index
            if not is_data_object:
                raise
        # lazy-load outside the except block, then look the attribute up again
        self.load(attr)
        return plain_lookup(attr)

    # This method exists as a bypass for the special behavior of
    # __getattribute__.  All code reachable from load() must take
    # care when accessing attributes of self in order to avoid an
    # infinite lazy-load loop; this makes that more convenient.
    def getattr(self, attr):
        """
        Get an attribute of self via the parent class's lookup, without
        either lazy-loading on failure or risking infinite loops inside
        lazy-load behaviors.
        """
        plain_lookup = super().__getattribute__
        return plain_lookup(attr)

    # The following three functions make this object act sort of dict-like
    #  in useful ways for data exploration.
    def keys(self) -> list[str]:
        """
        Return the names of all data objects defined in the label (or
        inferred while loading an object, like FITS headers). This is
        `self.index` itself, not a copy.
        """
        names = self.index
        return names

    def __contains__(self, name: str) -> bool:
        """Report whether `name` is the name of a data object in self."""
        known_names = self.index
        return name in known_names

    # make it possible to get data objects with slice notation, like a dict
    def __getitem__(self, name: str) -> Any:
        """
        Return the contained data object with the name 'name'.

        Names that are not in `self.index` are still resolved as ordinary
        attributes, but doing so emits a DeprecationWarning.
        """
        if name not in self.index:
            # stacklevel=2 attributes the warning to the code doing the
            # subscripting rather than to this method, so that Python's
            # default warning filters can actually show it to that caller
            warnings.warn("in a future release Data[name] will accept only"
                          " names of data objects, not other properties",
                          DeprecationWarning, stacklevel=2)
        return self.__getattribute__(name)

    def __repr__(self):
        """
        Summarize this object: source filename, keys, and (if any) the
        names of objects that are not yet loaded.
        """
        rep = f"pdr.Data({self.filename})\nkeys={self.keys()}"
        # compute the unloaded list once and reuse it for both the check
        # and the message
        pending = self.unloaded()
        if len(pending) > 0:
            rep += f"\nnot yet loaded: {pending}"
        return rep

    def __str__(self):
        """Return the same text as `__repr__()`."""
        text = self.__repr__()
        return text

    def __len__(self):
        """
        Return the number of data objects contained in self, i.e. the
        length of `self.index`.
        """
        count = len(self.index)
        return count

    def __iter__(self) -> Iterator[Any]:
        """
        Iterate over all the data objects contained in self.
        Iteration all the way to the end will cause all of the data
        objects to be loaded, which may run your computer out of memory.
        For this reason, iteration over Data objects is deprecated
        and will be removed in six months.

        A DeprecationWarning is emitted when iteration begins.
        """
        # stacklevel=2 attributes the warning to the code that is iterating
        # rather than to this method, so that Python's default warning
        # filters can actually show it to that caller
        warnings.warn("iteration over Data objects is deprecated"
                      " as it can crash your computer",
                      DeprecationWarning, stacklevel=2)
        for key in self.keys():
            yield self[key]

    _metaget_interior: Callable[[str, Any], Any]
    _metablock_interior: Callable[[str], Mapping]

__contains__(name: str) -> bool

True if self contains a data object with the name 'name'.

Source code in pdr/pdr.py
1161
1162
1163
def __contains__(self, name: str) -> bool:
    """Report whether `name` is the name of a data object in self."""
    known_names = self.index
    return name in known_names

__getattribute__(attr: str) -> Any

Get an attribute of self; known data objects can be referred to using attribute notation.

Source code in pdr/pdr.py
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
def __getattribute__(self, attr: str) -> Any:
    """
    Get an attribute of self; known data objects can be referred to
    using attribute notation. If ordinary lookup fails and `attr` names
    an object in `self.index`, that object is loaded and the lookup is
    retried. Otherwise the original AttributeError propagates.
    """
    plain_lookup = super().__getattribute__
    try:
        return plain_lookup(attr)
    except AttributeError:
        is_data_object = attr in self.index
        if not is_data_object:
            raise
    # lazy-load outside the except block, then look the attribute up again
    self.load(attr)
    return plain_lookup(attr)

__getitem__(name: str) -> Any

Return the contained data object with the name 'name'.

Source code in pdr/pdr.py
1166
1167
1168
1169
1170
1171
1172
1173
1174
def __getitem__(self, name: str) -> Any:
    """
    Return the contained data object with the name 'name'.

    Names that are not in `self.index` are still resolved as ordinary
    attributes, but doing so emits a DeprecationWarning.
    """
    if name not in self.index:
        # stacklevel=2 attributes the warning to the code doing the
        # subscripting rather than to this method, so that Python's
        # default warning filters can actually show it to that caller
        warnings.warn("in a future release Data[name] will accept only"
                      " names of data objects, not other properties",
                      DeprecationWarning, stacklevel=2)
    return self.__getattribute__(name)

__init__(fn: Union[Path, str], *, debug: bool = False, label_fn: Optional[Union[Path, str]] = None, search_paths: Union[Collection[str], str] = (), skip_existence_check: bool = False, pvl_limit: int = DEFAULT_PVL_LIMIT, tracker: Optional[TrivialTracker] = None, strict_label_decode: bool = True)

Source code in pdr/pdr.py
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
def __init__(
    self,
    fn: Union[Path, str],
    *,
    debug: bool = False,
    label_fn: Optional[Union[Path, str]] = None,
    search_paths: Union[Collection[str], str] = (),
    skip_existence_check: bool = False,
    pvl_limit: int = DEFAULT_PVL_LIMIT,
    tracker: Optional[TrivialTracker] = None,
    strict_label_decode: bool = True
):
    """
    Set up an interface to the product associated with `fn`: resolve its
    path and label, decide which standard it follows ("PDS3", "PDS4", or a
    "primary" format reported by `check_primary_fmt`), read its metadata,
    and index its data objects.

    Args:
        fn: path to a file of the product. Strings that look like http(s)
            or ftp URLs are rejected.
        debug: stored as `self.debug`; also selects a logging `Tracker`
            when no `tracker` is passed.
        label_fn: passed to `associate_label_file` when identifying the
            product's label.
        search_paths: additional search paths, placed after the
            automatically determined one.
        skip_existence_check: passed to `check_cases` and
            `associate_label_file`.
        pvl_limit: passed to `read_metadata` as `pvl_limit`.
        tracker: tracker to use; if None, one is constructed.
        strict_label_decode: passed to `read_metadata` as `strict_decode`.

    Raises:
        ValueError: if `fn` is a URL string, or if reading the metadata
            raises UnicodeError or FileNotFoundError.
    """
    # Bail out early if someone's trying to load directly from the network.
    if isinstance(fn, str) and re.match("(?i)^(?:https?|ftp):", fn):
        raise ValueError("Read-from-url is not currently implemented.")

    # list of the product's associated data objects
    self.index = []
    # do we raise an exception rather than a warning if loading a data
    # object fails?
    self.debug = debug
    self.filename = check_cases(Path(fn).absolute(), skip_existence_check)
    self.loaders = {}
    # in debug mode, default to a Tracker that logs under .tracker_logs
    # next to this module; otherwise default to a TrivialTracker
    if (self.debug is True) and (tracker is None):
        self.tracker = Tracker(
            Path(self.filename).name.replace(".", "_"),
            outdir=Path(__file__).parent / ".tracker_logs",
        )
        self.tracker.clear()
    elif tracker is None:
        self.tracker = TrivialTracker()
    else:
        self.tracker = tracker
    # mappings from data objects to local paths
    self.file_mapping = {}
    # known special constants per data object
    self.specials = {}
    # dict to flag images loaded prescaled (currently only from FITS files)
    self._scaleflags = {}
    # where can we look for files containing data objects?
    # not yet fully implemented; only uses first (automatic) one.
    self.search_paths = [self._init_search_path()] + listify(search_paths)
    self.standard = None
    # cache for pds4_tools.reader.general_objects.Structure objects.
    self._pds4_structures = None
    # cache for hdulist, for primary FITS files -- this is primarily
    # an optimization for compressed files
    self._hdulist = None
    # dict of [str, int] for HDU name / position in primary FITS files,
    # for reindexing duplicates and efficiency
    self._hdumap = None
    # data structure recording interleaved PDS3 objects not defined at top
    # level, intended for things like axplanes and line prefix tables
    self._interleaved_objects = {}
    # Attempt to identify and assign a label file
    self.labelname = associate_label_file(
        self.filename, label_fn, skip_existence_check
    )
    # if unlabeled, check to see if we can read it in a non-PDS format
    if self.labelname is None:
        primary_format = check_primary_fmt(self.filename)
    elif (fmt := check_primary_fmt(self.labelname)) is not None:
        # the "label" itself is in a primary format
        primary_format = fmt
    else:
        primary_format = None
    if primary_format is not None:
        self.standard = primary_format
        if self.standard == "FITS":
            from astropy.io import fits

            # TODO: bad. need to not leave this open, although inefficient
            self._hdulist = fits.open(self.filename)
    # label names with these extensions are treated as PDS4 labels; the
    # .2BL/.2AL/.2CL/.01L extensions count only when the label name also
    # contains "CE"
    elif (
        str(self.labelname).endswith(".xml") 
        or str(self.labelname).endswith(".lblx")
        or ("CE" in str(self.labelname) and
            str(self.labelname).endswith((".2BL", ".2AL", ".2CL", ".01L")))
    ):
        self.standard = "PDS4"
        self._pds4_structures = {}
        self._init_pds4()
    else:
        self.standard = "PDS3"
    try:
        self.metadata = self.read_metadata(
            pvl_limit=pvl_limit, strict_decode=strict_label_decode
        )
    except (UnicodeError, FileNotFoundError) as ex:
        raise ValueError(
            f"Can't load this product's metadata: {ex}, {type(ex)}"
        )
    self.load_metadata_changes()
    # PDS4 and primary-format products need none of the PDS3 pointer
    # handling below
    if self.standard == "PDS4":
        return
    if primary_format is not None:
        self._init_primary_format()
        return
    self.identifiers = self.metadata.identifiers
    self.pointers = get_pds3_pointers(self.metadata)
    # if self.pointers is None, we've probably got a weird edge case where
    # someone directly opened a PVL file that's not an individual product
    # label (e.g. a format file or a non-PDS PVL file) -- but there's no
    # reason to not allow them to use PDR as a PVL parser.
    if self.pointers is not None:
        self._find_objects()

__iter__() -> Iterator[Any]

Iterate over all the data objects contained in self. Iteration all the way to the end will cause all of the data objects to be loaded, which may run your computer out of memory. For this reason, iteration over Data objects is deprecated and will be removed in six months.

Source code in pdr/pdr.py
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
def __iter__(self) -> Iterator[Any]:
    """
    Iterate over all the data objects contained in self, yielding
    `self[key]` for every key returned by `self.keys()`.

    Iteration all the way to the end will cause all of the data
    objects to be loaded, which may run your computer out of memory.
    For this reason, iteration over Data objects is deprecated
    and will be removed in six months.

    A `DeprecationWarning` is emitted when iteration actually starts
    (this is a generator function, so that is on the first `next()`,
    not when the iterator is created).
    """
    # stacklevel=2 attributes the warning to the code driving the iteration
    # rather than to this module. Python's default warning filters only
    # display DeprecationWarnings attributed to `__main__`, so attributing
    # the warning to this module would hide it from exactly the users who
    # need to see it.
    warnings.warn(
        "iteration over Data objects is deprecated"
        " as it can crash your computer",
        DeprecationWarning,
        stacklevel=2,
    )
    for key in self.keys():
        yield self[key]

__len__()

Return the number of data objects contained in self.

Source code in pdr/pdr.py
1187
1188
1189
1190
1191
def __len__(self):
    """
    Report how many data objects self knows about, i.e. the number of
    entries in `self.index`.
    """
    known_objects = self.index
    return len(known_objects)

__repr__()

Source code in pdr/pdr.py
1176
1177
1178
1179
1180
1181
def __repr__(self):
    """
    Build the debugging representation of self: the source filename and
    the known object keys, followed -- only when there are any -- by the
    names reported by `self.unloaded()`.
    """
    pieces = [f"pdr.Data({self.filename})\nkeys={self.keys()}"]
    if len(self.unloaded()) > 0:
        pieces.append(f"\nnot yet loaded: {self.unloaded()}")
    return "".join(pieces)

__str__()

Source code in pdr/pdr.py
1183
1184
1185
def __str__(self):
    """Return the same text that `self.__repr__()` produces."""
    text = self.__repr__()
    return text

_add_loaded_objects(obj: Mapping[str, Any])

Helper for load(). Ingests objects returned by a Loader.

Source code in pdr/pdr.py
602
603
604
605
606
607
608
def _add_loaded_objects(self, obj: Mapping[str, Any]):
    """
    Helper for `load()`. Ingests objects returned by a `Loader`.

    Each non-None value in `obj` is assigned to the attribute of self
    named by its key, and that key is appended to `self.index` unless it
    is already present. Entries whose value is None are skipped entirely.
    """
    for name, loaded in obj.items():
        if loaded is None:
            continue
        setattr(self, name, loaded)
        if name in self.index:
            continue
        self.index.append(name)

_associate_prefix_tables(imname, preobjs)

Check for underspecified line prefix table objects associated with a PDS3 image specification.

Source code in pdr/pdr.py
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
def _associate_prefix_tables(self, imname, preobjs):
    """
    Look for line prefix tables that belong to the PDS3 image object
    `imname` but are underspecified in the label, and register them in
    `self._interleaved_objects` with `imname` as their parent.

    `preobjs` is a collection of candidate prefix-object names; any of
    them whose standalone start byte equals the image's is associated with
    the image. If none match, an implicit `{imname}_LINE_PREFIX_TABLE`
    object is registered and added to `self.index`.

    Raises NotImplementedError if the image's block contains more than one
    key matching STRUCTUREPAT.
    """
    label_block = self.metablock_(imname)
    # TODO, maybe: do a special case check against the name. This is only
    #  important if we ever have a special object name case that is
    #  relevant. (currently we do not.)
    if label_block is None:
        return
    structure_pointers = tuple(filter(STRUCTUREPAT.match, label_block))
    if len(structure_pointers) == 0:
        return
    if len(structure_pointers) > 1:
        # hopefully this never happens. we will handle it if so
        raise NotImplementedError(
            "Multiple implicitly-defined line prefix tables within a "
            "single object are not supported."
        )
    # check for a matching prefix table defined at top level with no
    # explicit block
    if len(preobjs) > 0:
        from pdr.loaders.queries import standalone_start_byte

        image_start = standalone_start_byte(self, imname)
        matched = False
        for candidate in preobjs:
            if standalone_start_byte(self, candidate) != image_start:
                continue
            self._interleaved_objects[candidate] = dict(
                parent=imname, type='line_prefix_table'
            )
            matched = True
        if matched is True:
            return
    # TODO: unclear if this should be a special case or a general fix. Need
    # more examples to know for sure; Galileo SSI-4-REDR IMAGE objects are
    # the current known example.
    # For cases when ^LINE_PREFIX_STRUCTURE = ""
    if "^LINE_PREFIX_STRUCTURE" in label_block:
        if label_block["^LINE_PREFIX_STRUCTURE"] == "":
            return
    # TODO, maybe: technically this could be a line suffix table,
    #  although we have never found them. We could add a check.
    implicit_name = f'{imname}_LINE_PREFIX_TABLE'
    self._interleaved_objects[implicit_name] = dict(
        parent=imname, type='line_prefix_table'
    )
    self.index.append(implicit_name)

_check_compressed_file_pointer(object_name: str) -> tuple[bool, Optional[tuple[Path, ...]]]

When PDS3 labels describe data objects in compressed files, they often give the names that the compressed files would have, were someone to decompress them, as the physical locations of those objects. This can be confusing, because you cannot load an object from a merely hypothetical file.

However, this is by no means a strict convention, so we can't just assume that it's the case -- we have to check all the file names mentioned for that object in the label, including those not given as top-level pointers.

Source code in pdr/pdr.py
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
def _check_compressed_file_pointer(
    self, object_name: str
) -> tuple[bool, Optional[tuple[Path, ...]]]:
    """
    When PDS3 labels describe data objects in compressed files, they often
    give the names that the compressed files _would_ have, were someone to
    decompress them, as the physical locations of those objects. This can
    be confusing, because you cannot load an object from a merely
    hypothetical file.

    However, this is by no means a strict convention, so we can't just
    assume that it's the case -- we have to check all the file names
    mentioned for that object in the label, including those not given as
    top-level pointers.

    Returns `(True, paths)` when the label has both COMPRESSED_FILE and
    UNCOMPRESSED_FILE entries and `object_name` appears in the
    UNCOMPRESSED_FILE block; `paths` concatenates the results of
    `self.get_absolute_paths()` for each FILE_NAME found in those blocks,
    COMPRESSED_FILE first. Otherwise returns `(False, None)`.
    """
    # An ordered tuple rather than a set: iteration order of a set of
    # strings changes between interpreter runs (hash randomization), which
    # made the order of the returned search paths nondeterministic.
    compkeys = ("COMPRESSED_FILE", "UNCOMPRESSED_FILE")
    if (
        all(key in self.metadata.keys() for key in compkeys)
        and object_name in self.metablock_("UNCOMPRESSED_FILE").keys()
    ):
        # drop missing blocks / blocks without a FILE_NAME
        blocks = filter(None, [self.metaget_(k) for k in compkeys])
        filenames = filter(None, [b.get("FILE_NAME") for b in blocks])
        return True, tuple(
            chain.from_iterable(map(self.get_absolute_paths, filenames))
        )
    return False, None

_file_not_found(object_name: str)

Implements default file-not-found behavior.

Source code in pdr/pdr.py
622
623
624
625
626
627
628
629
630
631
632
def _file_not_found(self, object_name: str):
    """
    Default handling for a data object whose file could not be located.

    Raises FileNotFoundError when `self.debug` is truthy. Otherwise emits
    a warning and assigns whatever `self.metaget_(object_name)` returns to
    the attribute of self named `object_name`.
    """
    searched = self._object_to_filename(object_name)
    message = f"{object_name} file {searched} not found in path."
    if self.debug:
        raise FileNotFoundError(message)
    warnings.warn(message)
    setattr(self, object_name, self.metaget_(object_name))

_find_fits_header_pds4_id(start_byte: int) -> Optional[str]

Given start byte for an HDU's data segment, check to see if the PDS4 product associated with self includes that HDU's header as a distinct data object with a local identifier. If it is, return the PDS4 local identifier of that object. If not, return None.

Source code in pdr/pdr.py
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
def _find_fits_header_pds4_id(self, start_byte: int) -> Optional[str]:
    """
    Given the start byte of an HDU's data segment, look through the PDS4
    structures of self for a header object that ends exactly at that byte
    (its 'offset' plus its 'object_length' equals `start_byte`).

    Returns that header's 'name' with spaces replaced by underscores.
    Returns None if no header matches, or if the first matching header has
    no 'name'.
    """
    from pdr.pds4_tools.reader.header_objects import HeaderStructure

    for structure in self._pds4_structures.values():
        # we only require header objects to have the 'object_length' key.
        # label objects also do not have meta_data
        if not isinstance(structure, HeaderStructure):
            continue
        meta = structure.meta_data
        if 'object_length' not in meta.keys():
            continue
        if meta['offset'] + meta['object_length'] != start_byte:
            continue
        if 'name' not in meta.keys():
            return None
        return meta['name'].replace(' ', '_')

    return None

_find_objects()

Add all top-level data objects mentioned in the label to this object's index, except for 'trivial' ones (see loaders.utility.is_trivial()). Also check for interleaved objects not defined at top level (such as some line prefix tables).

TODO: check for ISIS-style axplane objects.

Source code in pdr/pdr.py
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
def _find_objects(self):
    """
    Populate `self.index` with the top-level data objects named by the
    label's pointers, leaving out 'trivial' ones (see
    `loaders.utility.is_trivial()`). Products flagged by
    `check_special_objects()` get that function's object list instead and
    nothing else is done for them.

    Afterwards, check image objects for interleaved objects that are not
    defined at top level (such as some line prefix tables).

    TODO: check for ISIS-style axplane objects.
    """
    from pdr.loaders.utility import is_trivial
    from pdr.formats.checkers import check_special_objects

    is_special, special_objects = check_special_objects(self.identifiers)
    if is_special is True:
        self.index += special_objects
        return

    # TODO: make this not add objects again if called multiple times
    for name in map(depointerize, self.pointers):
        if not is_trivial(name):
            self.index.append(name)
    # check for poorly / implicitly-defined line prefix tables
    # top-level line prefix objects with no blocks of their own
    preobjs = [
        name for name in self.index
        if "LINE_PREFIX" in name and self.metablock_(name) is None
    ]
    # materialize the image names up front so that self.index can be
    # mutated while we walk them
    image_names = tuple(
        name for name in self.index
        if "IMAGE" in name and "TABLE" not in name
    )
    for image_name in image_names:
        self._associate_prefix_tables(image_name, preobjs)

_init_pds4()

use pds4_tools to open pds4 files, but in our interface idiom.

Source code in pdr/pdr.py
335
336
337
338
339
340
341
342
343
344
345
346
347
def _init_pds4(self):
    """
    Open the PDS4 product with pds4_tools, but in our interface idiom:
    every structure is stored in `self._pds4_structures` and listed in
    `self.index` under its id with spaces replaced by underscores, and the
    label itself is stored and indexed as "label".
    """

    import pdr.pds4_tools as pds4

    read_options = dict(lazy_load=True, quiet=True, no_scale=True)
    structure_list = pds4.read(self.labelname, **read_options)
    for structure in structure_list.structures:
        key = structure.id.replace(" ", "_")
        self._pds4_structures[key] = structure
        self.index.append(key)
    self._pds4_structures["label"] = structure_list.label
    self.index.append("label")

_init_primary_format()

Initialization handler for "primary" format modes (cases in which Data offers an interface to a file or files in a standard format). Currently only supports FITS and 'desktop' image formats.

Source code in pdr/pdr.py
651
652
653
654
655
656
657
658
659
660
661
662
663
def _init_primary_format(self):
    """
    Initialization handler for "primary" format modes (cases in which
    `Data` offers an interface to a file or files in a standard format).
    Currently only supports FITS and 'desktop' image formats.

    For FITS, every metadata key is appended to `self.index`. For desktop
    image standards, the work is delegated to
    `self._add_compressed_image_objects()`. Any other standard raises
    NotImplementedError.
    """
    if self.standard == "FITS":
        self.index.extend(self.metadata.keys())
        return
    if self.standard not in DESKTOP_IMAGE_STANDARDS:
        raise NotImplementedError(f"unrecognized standard {self.standard}")
    return self._add_compressed_image_objects()

_init_search_path() -> str

Set initial path this object will check for additional files (just the directory that contains its "primary" file).

Source code in pdr/pdr.py
349
350
351
352
353
354
355
356
357
def _init_search_path(self) -> str:
    """
    Set initial path this object will check for additional files (just the
    directory that contains its "primary" file).

    `labelname` is tried first, then `filename`; the absolute parent
    directory of the first one that is set to something other than None is
    returned as a string. Raises FileNotFoundError if neither is usable.
    """
    for target in ("labelname", "filename"):
        if target not in dir(self):
            continue
        # Check the attribute's *value*, not its name: the name is a string
        # literal and can never be None, so testing it let an attribute
        # that exists but is None reach Path() and raise TypeError instead
        # of falling through to the next candidate.
        value = self.getattr(target)
        if value is not None:
            return str(Path(value).absolute().parent)
    raise FileNotFoundError

_load_pds4(object_name: str)

Load this object however pds4_tools wants to load this object, then reformat to DataFrame, expose the array handle in accordance with our type conventions, etc.

If the object is from a FITS file, preempt all that behavior and send it to our internal FITS-loading workflow.

Source code in pdr/pdr.py
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
def _load_pds4(self, object_name: str):
    """
    Load this object however pds4_tools wants to load this object, then
    reformat to DataFrame, expose the array handle in accordance with our
    type conventions, etc.

    If the object is from a FITS file, preempt all that behavior and send
    it to our internal FITS-loading workflow.

    Cases are tried in this order: the label itself; products with special
    handling (`check_special_pds4_cases`); objects stored in FITS files;
    arrays; tables; and finally anything else, which is assigned as the
    structure's raw `data`. The result is set as an attribute of self
    rather than returned.
    """
    structure = self._pds4_structures[object_name]
    from pdr.pds4_tools.reader.label_objects import Label

    if isinstance(structure, Label):
        setattr(self, "label", structure)
    elif (
        # evaluate the special-case check once and keep its result, rather
        # than running the same call a second time to fetch the value
        special := check_special_pds4_cases(
            structure, self.filename, object_name
        )
    ) is not None:
        setattr(self, object_name, special)
    elif check_primary_fmt(structure.parent_filename) == "FITS":
        from pdr.loaders.handlers import handle_fits_file

        offset = structure.meta_data['offset']
        result = handle_fits_file(
            structure.parent_filename, object_name, offset
        )
        if structure.is_header() is True:
            return self._add_loaded_objects(result)
        # if the label names this HDU's header as its own object, file the
        # header under that local identifier
        if (hid := self._find_fits_header_pds4_id(offset)) is not None:
            result[hid] = result.pop(f"{object_name}_HEADER")
        if structure.is_array() is True:
            self._scaleflags[object_name] = True
        self._add_loaded_objects(result)
    elif structure.is_array():
        import numpy as np

        setattr(self, object_name, np.asarray(structure.data))
    elif structure.is_table():
        from pdr.pd_utils import structured_array_to_df

        df = structured_array_to_df(structure.data)
        # strip "GROUP_<n>" fragments and leftover separators from names
        df.columns = df.columns.str.replace(r"GROUP_?\d+", "", regex=True)
        df.columns = df.columns.str.strip(", ")
        setattr(self, object_name, df)
    # TODO: do other important cases exist?
    else:
        setattr(self, object_name, structure.data)

_load_primary_fits(object_name: str) -> Union[np.ndarray, pd.DataFrame, None]

Handle loading an HDU from a FITS file in "primary" FITS mode.

Source code in pdr/pdr.py
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
def _load_primary_fits(
    self, object_name: str
) -> Union[np.ndarray, pd.DataFrame, None]:
    """
    Handle loading an HDU from a FITS file in "primary" FITS mode.

    The HDU is looked up through `self._hdumap` and read from the
    already-open `self._hdulist`. When the loaded object is an ndarray,
    its entry in `self._scaleflags` is set to True.
    """
    from pdr.loaders.handlers import handle_fits_file

    handler_args = (
        self.filename,
        object_name,
        self._hdumap[object_name],
        self._hdulist,
    )
    loaded = handle_fits_file(*handler_args, hdu_id_is_index=True)
    # class-name comparison avoids needing numpy just for a type check
    if loaded.__class__.__name__ == "ndarray":
        self._scaleflags[object_name] = True
    return loaded

_object_to_filename(object_name: str) -> Union[str, list[str], Optional[tuple[Path, ...]]]

Construct one or more on-disk search paths for the file that contains a named data object. Does not actually check if files exist at those paths (typically performed by calls to utils.check_cases()).

Source code in pdr/pdr.py
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
def _object_to_filename(
    self, object_name: str
) -> Union[str, list[str], Optional[tuple[Path, ...]]]:
    """
    Construct one or more on-disk search paths for the file that contains
    a named data object. Does not actually check if files exist at those
    paths (typically performed by calls to `utils.check_cases()`).

    Sources are consulted in this order: special-case filename rules
    (`check_special_fn`), compressed-file pointers, and then the label's
    pointer for the object. If that pointer does not resolve to a string,
    `self.filename` is returned.
    """
    is_special, special_target = check_special_fn(
        self, object_name, self.identifiers
    )
    if is_special is True:
        return self.get_absolute_paths(special_target)
    is_comp, comp_paths = self._check_compressed_file_pointer(object_name)
    if is_comp is True:
        return comp_paths
    target = self.metaget_(pointerize(object_name))
    is_nonstring_sequence = (
        isinstance(target, Sequence) and not isinstance(target, str)
    )
    # for a sequence-valued pointer, use its first element when that
    # element is a string
    if is_nonstring_sequence and isinstance(target[0], str):
        target = target[0]
    if not isinstance(target, str):
        return self.filename
    return self.get_absolute_paths(target)

_target_path(object_name: str, cached: bool = True, raise_missing: bool = False) -> Optional[Union[Path, list[Path], str]]

Considering all known search paths and treating filenames as case-insensitive, attempt to find a filesystem path to a file or files in which a particular named data object might exist. This autopopulates self.file_mapping[object_name] if it finds one or more files, and by default treats this value as cached on subsequent calls (which can improve performance significantly, especially on networked filesystems).

Source code in pdr/pdr.py
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
def _target_path(
    self,
    object_name: str,
    cached: bool = True,
    raise_missing: bool = False
) -> Optional[Union[Path, list[Path], str]]:
    """
    Considering all known search paths and treating filenames as
    case-insensitive, attempt to find a filesystem path to a
    file or files in which a particular named data object might exist.
    This autopopulates self.file_mapping[object_name] if it finds one or
    more files, and by default treats this value as cached on subsequent
    calls (which can improve performance significantly, especially on
    networked filesystems).

    If `object_name` is a set of names, each member is resolved
    individually and a list of the results is returned.

    Returns None when nothing is found, unless `raise_missing` is True, in
    which case the FileNotFoundError is propagated.
    """
    if isinstance(object_name, set):
        # A set is unhashable, so it can be neither looked up in nor stored
        # into self.file_mapping -- doing either raised TypeError before
        # any member was resolved. Resolve the members one by one instead;
        # each of them is still cached under its own name.
        return [
            self._target_path(
                obj, cached=cached, raise_missing=raise_missing
            )
            for obj in object_name
        ]
    if cached is True and (self.file_mapping.get(object_name) is not None):
        return self.file_mapping[object_name]
    try:
        path = check_cases(self._object_to_filename(object_name))
    except FileNotFoundError:
        if raise_missing is True:
            raise
        return None
    self.file_mapping[object_name] = path
    return path

dump_browse(prefix: Optional[Union[str, Path]] = None, outpath: Optional[Union[str, Path]] = None, scaled: bool = True, purge: bool = False, **browse_kwargs: Any) -> None

attempt to dump all data objects associated with this Data object to disk.

By default, writes files to the working directory.

By default, assigns filenames like: {filename stem}_{object name}.{file extension}

So, for instance, a browse version of a TABLE object referenced from "jn23a1.lbl" would be written to "jn23a1_TABLE.csv".

If prefix is not None, filenames will begin with the value of prefix rather than the original filename stem.

If outpath is not None, files will be written to the value of outpath rather than to the working directory.

By default, attempts to apply scaling/offset factors and special constant masking before writing images. If scaled is False, does not do that. If scaled == "both", writes both scaled and unscaled versions, adding "_scaled" and "_unscaled" to their respective filenames before the file extension. Note that some types of load operations (like for FITS files) may have already applied scaling factors, in which case recovering the unscaled image is not possible.

if purge is True, objects are deleted as soon as they are dumped, rendering this Data object 'empty' afterward.

**browse_kwargs are passed directly to browsify.browsify(), and offer various ways to modify image dumping behavior:

  • image_clip: Union[float, tuple[float, float], None] = None Applies a percentile clip to the image at clip = (low_percentile, 100-high_percentile). If clip is a single value, low_percentile=high_percentile in the above formula. If it's a tuple, low_percentile is the first value in the tuple.

    The default None value causes 'nice' clipping: it clips the image at (1, 1), but if this results in the clipped image containing only a single value, it uses the original image instead. Pass 0 if absolutely no clipping is desired.

  • mask_color: Optional[tuple[int, int, int]] = (0, 255, 255) Allows specification of RGB color for masked arrays (default cyan)

  • band_ix: Optional[int] = None The index of the band to be exported in a multiband image. If None, the middle band of the image is exported. If there are 3-4 bands in the image and the override_rgba argument is False, this value is ignored.

    When set equal to "burst", returns a separate browse product for each band of a multiband image, appending numbers to the filenames prior to the file extension.

  • save: bool = True If False, renders images in memory but does not save them to disk. Not generally useful when passed to this method except for testing.

  • override_rgba: bool = False Allows use of band_ix when there are 3-4 bands in the image. Otherwise, the image will be returned as a stacked rgb image (the assumed 'alpha' channel is always dropped). Setting this to True is useful when a 3/4 band image is not actually RGB(A) (e.g. XYZ spatial products).

    This argument has no effect on images that do not have 3-4 bands.

  • image_format: str = "jpg" Sets image extension which informs the format pillow will save the browse image as.

  • slice_axis: int = 0 Allows specification of which axis to slice along for the dump_browse image. The default slices at axis 0 (which is usually the axis labelled "BAND").

  • rgb_channels: Optional[tuple[int, int, int]] = None Allows specification of the bands used to create an RGB image. By default the first three bands of a 3-4 band image are used for the red, green, and blue channels respectively (equivalent to manually specifying rgb_channels=(0,1,2)).

    If this argument is used, band_ix and override_rgba are ignored. It can also be used on multiband images with >4 bands to output an RGB image.

Source code in pdr/pdr.py
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
def dump_browse(
    self,
    prefix: Optional[Union[str, Path]] = None,
    outpath: Optional[Union[str, Path]] = None,
    scaled: bool = True,
    purge: bool = False,
    **browse_kwargs: Any,
) -> None:
    """
    Attempt to write browse versions of all loaded data objects associated
    with this Data object to disk.

    Output location and naming:

    - Files go to the working directory unless `outpath` is given, in
      which case they are written there instead.
    - Filenames look like {filename stem}_{object name}.{file extension},
      so a browse version of a TABLE object referenced from "jn23a1.lbl"
      would be written to "jn23a1_TABLE.csv". If `prefix` is not None, it
      replaces the original filename stem.

    Scaling:

    By default, attempts to apply scaling/offset factors and special
    constant masking before writing images. If `scaled` is False, does not
    do that. If `scaled == "both"`, writes both scaled and unscaled
    versions, adding "_scaled" and "_unscaled" to their respective
    filenames before the file extension. Note that some types of load
    operations (like for FITS files) may have already applied scaling
    factors, in which case recovering the unscaled image is not possible;
    for FITS files a warning is issued and only a "_scaled" product is
    written. Any other value of `scaled` raises ValueError when an image is
    encountered.

    Purging:

    If `purge` is True, objects are deleted as soon as they are dumped,
    rendering this Data object 'empty' afterward.

    **browse_kwargs are passed directly to browsify.browsify(), and
    offer various ways to modify image dumping behavior:

    - image_clip: Union[float, tuple[float, float], None] = None
        Applies a percentile clip to the image at
        clip = (low_percentile, 100-high_percentile).
        If clip is a single value, low_percentile=high_percentile
        in the above formula. If it's a tuple, low_percentile is
        the first value in the tuple.

        The default None value causes 'nice' clipping: it clips the image
        at (1, 1), but if this results in the clipped image containing only
        a single value, it uses the original image instead. Pass 0 if
        absolutely no clipping is desired.

    - mask_color: Optional[tuple[int, int, int]] = (0, 255, 255)
        Allows specification of RGB color for masked arrays (default cyan)

    - band_ix: Optional[int] = None
        The index of the band to be exported in a multiband image. If None,
        the middle band of the image is exported. If there are 3-4 bands in
        the image and the override_rgba argument is False, this value is
        ignored.

        When set equal to "burst", returns a separate browse product for
        each band of a multiband image, appending numbers to the filenames
        prior to the file extension.

    - save: bool = True
        If False, renders images in memory but does not save them to disk.
        Not generally useful when passed to this method except for testing.

    - override_rgba: bool = False
        Allows use of band_ix when there are 3-4 bands in the image.
        Otherwise, the image will be returned as a stacked rgb image
        (the assumed 'alpha' channel is always dropped). Setting this to
        True is useful when a 3/4 band image is not actually RGB(A) (e.g.
        XYZ spatial products).

        This argument has no effect on images that do not have 3-4 bands.

    - image_format: str = "jpg"
        Sets image extension which informs the format pillow will save the
        browse image as.

    - slice_axis: int = 0
        Allows specification of which axis to slice along for the
        dump_browse image. The default slices at axis 0 (which is usually
        the axis labelled "BAND").

    - rgb_channels: Optional[tuple[int, int, int]] = None
        Allows specification of the bands used to create an RGB image. By
        default the first three bands of a 3-4 band image are used for the
        red, green, and blue channels respectively (equivalent to manually
        specifying rgb_channels=(0,1,2)).

        If this argument is used, band_ix and override_rgba are ignored. It
        can also be used on multiband images with >4 bands to output an RGB
        image.

    """
    prefix = Path(self.filename).stem if prefix is None else prefix
    outpath = Path(".") if outpath is None else outpath
    float_dtype = browse_kwargs.get("float_dtype")
    for name in self.index:
        # only objects that currently exist as attributes get dumped
        if name not in dir(self):
            continue
        target = str(Path(outpath, f"{prefix}_{name}"))
        # no need to have all this mpl stuff in the namespace normally
        from pdr.browsify import browsify

        render = partial(browsify, purge=purge, **browse_kwargs)
        # scaling options only apply to arrays with more than one dimension
        is_multidim_array = (
            self[name].__class__.__name__ == "ndarray"
            and len(self[name].shape) != 1
        )
        if not is_multidim_array:
            render(self[name], target)
        elif (
            looks_like_this_kind_of_file(self.filename, FITS_EXTENSIONS)
            and (scaled == "both" or scaled is False)
        ):
            warnings.warn("Scaling for FITS files cannot be turned "
                          "off, dumping scaled products.")
            render(self[name], target + "_scaled")
        elif scaled == "both":
            # keep the object around for the unscaled dump that follows
            render(
                self.get_scaled(name, float_dtype=float_dtype),
                target + "_scaled",
                purge=False,
            )
            render(self[name], target + "_unscaled")
        elif scaled is True:
            render(
                self.get_scaled(
                    name, inplace=purge, float_dtype=float_dtype
                ),
                target,
            )
        elif scaled is False:
            render(self[name], target)
        else:
            raise ValueError(f"unknown scaling argument {scaled}")
        if purge is True:
            self.__delattr__(name)

find_special_constants(object_name: str) -> dict[str, Number]

look up or infer special constants for one of our data objects. in general, only works well on ndarrays.

Source code in pdr/pdr.py
902
903
904
905
906
907
908
909
910
911
912
def find_special_constants(self, object_name: str) -> dict[str, Number]:
    """
    Look up or infer special constants for one of our data objects.
    In general, only works well on ndarrays.

    Constants returned by `special_image_constants()` for this product's
    identifiers take precedence; only when that returns nothing are
    constants inferred from the object itself via `pdr._scaling`.
    """
    known = special_image_constants(self.identifiers)
    if len(known) > 0:
        return known

    from pdr._scaling import find_special_constants as infer_constants

    return infer_constants(self, self[object_name], object_name)

get_absolute_paths(filename: Union[str, Path]) -> list[str]

Construct Paths for a filename in all our search paths. (These are places we can look for that file).

Source code in pdr/pdr.py
950
951
952
953
954
955
956
957
958
959
def get_absolute_paths(self, filename: Union[str, Path]) -> list[str]:
    """
    Construct `Path`s for a filename in all our search paths. (These are
    places we can look for that file).

    Every pairing of an entry of `self.search_paths` with an entry of
    `listify(filename)` is joined into a single path and made absolute.
    Nothing here checks whether the resulting paths exist.
    """
    # NOTE(review): the lambda below produces `Path` objects, so the
    # elements handed back look like `Path`s rather than the `str`s the
    # return annotation promises -- confirm against `gmap`'s behavior.
    return gmap(
        # each `sf` is one (search path, filename) pair from product()
        lambda sf: Path(*sf).absolute(),
        product(self.search_paths, listify(filename)),
        evaluator=list,
    )

get_scaled(object_name: str, inplace: bool = False, float_dtype: Optional[np.dtype] = None) -> np.ndarray

fetches copy of data object corresponding to key, masks special constants, then applies any scale and offset specified in the label. only relevant to arrays.

if inplace is True, does calculations in-place on original array, with attendant memory savings and destructiveness.

Source code in pdr/pdr.py
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
def get_scaled(
    self,
    object_name: str,
    inplace: bool = False,
    float_dtype: Optional[np.dtype] = None
) -> np.ndarray:
    """
    Fetch the data object stored under `object_name`, mask its special
    constants, and apply any scale and offset specified in the label.
    Only relevant to arrays; anything else raises TypeError.

    The object is returned as-is when it is already flagged as scaled in
    `self._scaleflags`, or when the product's standard is neither PDS3 nor
    PDS4. PDS4 objects are handed to `scale_pds4_tools_struct()`.

    If `inplace` is True, does calculations in-place on original array,
    with attendant memory savings and destructiveness.
    """
    obj = self[object_name]
    # avoid numpy import just for type check
    if obj.__class__.__name__ != "ndarray":
        raise TypeError("get_scaled is only applicable to arrays.")
    already_scaled = self._scaleflags.get(object_name) is True
    if already_scaled:
        return obj
    if self.standard == "PDS4":
        from pdr._scaling import scale_pds4_tools_struct

        # Do whatever pds4_tools would most likely do with these data.
        # TODO: shake this out much more vigorously. perhaps make
        #  the inplace and float_dtype arguments do something.
        #  check and see if implicit special constants ever still exist
        #  stealthily in PDS4 data. etc.
        pds4_structure = self._pds4_structures[object_name]
        return scale_pds4_tools_struct(pds4_structure)

    # TODO: double-check that astropy is successfully handling masking
    # TODO: most 'desktop' image formats should never contain special
    #  constants, but some (e.g. JP2) may be able to? check.
    if self.standard != "PDS3":
        return obj

    from pdr._scaling import mask_specials, scale_array

    # special constants are looked up once per object and then reused
    if object_name not in self.specials:
        self.specials[object_name] = self.find_special_constants(
            object_name
        )
    constants = self.specials[object_name]
    if constants != {}:
        obj = mask_specials(obj, list(constants.values()))
    return scale_array(self, obj, object_name, inplace, float_dtype)

getattr(attr)

get an attribute of self without either lazy-loading on failure or risking infinite loops inside lazy-load behaviors.

Source code in pdr/pdr.py
1145
1146
1147
1148
1149
1150
def getattr(self, attr):
    """
    Get an attribute of self without either lazy-loading on failure or
    risking infinite loops inside lazy-load behaviors.

    The lookup is delegated straight to the parent class's
    `__getattribute__`, so whatever attribute-access customization this
    class itself defines is not involved.
    """
    # zero-argument super() only works inside the class body this method
    # belongs to
    return super().__getattribute__(attr)

keys() -> list[str]

Returns names of all data objects defined in the label (or inferred while loading an object, like FITS headers).

Source code in pdr/pdr.py
1154
1155
1156
1157
1158
1159
def keys(self) -> list[str]:
    """
    List the names of every data object this product knows about: those
    defined in the label, plus any inferred while loading an object (FITS
    headers, for example).

    The returned list is `self.index` itself, not a copy.
    """
    object_names = self.index
    return object_names

load(name: str, reload: bool = False, **load_kwargs: Any)

Explicitly load an identified data object by name; alternatively name="all" means "load every identified object". Does not return the object; just assigns it to the name attribute of self. The Data.__getitem__() interface lazy-loads by calling this function with default arguments in response to data['NOTYETLOADED'] etc.

Source code in pdr/pdr.py
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
def load(self, name: str, reload: bool = False, **load_kwargs: Any):
    """
    Explicitly load an identified data object by name; alternatively
    `name="all"` means "load every identified object". Does not return the
    object; just assigns it to the `name` attribute of `self`. The
    `Data.__getitem__()` interface lazy-loads by calling this function
    with default arguments in response to `data['NOTYETLOADED']` etc.

    Dispatch depends on `self.standard`: PDS4 products go through
    `_load_pds4`, FITS products through `_load_primary_fits`, 'desktop'
    image standards through `handle_compressed_image`, and everything else
    through `load_from_pointer` (the only path that receives
    `load_kwargs`).

    If the pointer-based load fails for any reason other than
    KeyboardInterrupt, a warning is issued (except when the failure was
    already reported in debug mode) and the object's metadata block, as
    returned by `self.metaget_(name)`, is assigned to `name` instead.

    Raises:
        KeyError: `name` is neither "all" nor present in `self.index`.
        AlreadyLoadedError: `name` is already an attribute of `self` and
            `reload` is False.
    """
    # prelude: don't try to load nonexistent keys; facilitate
    # load-everything behavior; don't reload by default
    if (name != "all") and (name not in self.index):
        raise KeyError(f"{name} not found in index: {self.index}.")
    if name == "all":
        return self.load_all()
    if (name in dir(self)) and (reload is False):
        raise AlreadyLoadedError(
            f"{name} is already loaded; pass reload=True to "
            f"force reload."
        )
    if self.standard == "PDS4":
        return self._load_pds4(name)
    if self.standard == "FITS":
        self._add_loaded_objects(self._load_primary_fits(name))
        return
    if self.standard in DESKTOP_IMAGE_STANDARDS:
        from pdr.loaders.handlers import handle_compressed_image

        # single-frame images need no frame index
        if self.metaget("n_frames", 1) == 1:
            self._add_loaded_objects(
                {name: handle_compressed_image(self.filename)}
            )
            return
        # TODO: hacky!
        # otherwise the frame index is the trailing "_<n>" of the object
        # name, except for the MPO 'IMAGE' object, which is frame 0
        if self.standard == 'MPO' and name == 'IMAGE':
            seek = 0
        else:
            seek = int(name.split("_")[-1])
        self._add_loaded_objects(
            {name: handle_compressed_image(self.filename, seek)}
        )
        return
    # resolve (and remember) the file that holds this object
    if self.file_mapping.get(name) is None:
        target = self._target_path(name)
        if target is None:
            return self._file_not_found(name)
        self.file_mapping[name] = target
    try:
        obj = self.load_from_pointer(name, **load_kwargs)
        if obj is None:
            return
        if not isinstance(obj, dict):
            raise TypeError(
                f"loader returned non-dict object of type ({type(obj)})"
            )
        self._add_loaded_objects(obj)
        return
    except DebugExceptionPreempted:
        # load_from_pointer has already warned about this failure
        pass
    except KeyboardInterrupt:
        raise
    except NotImplementedError as ex:
        warnings.warn(f"This product's {name} is not yet supported: {ex}.")
    except FileNotFoundError:
        warnings.warn(f"Unable to find files required by {name}.")
    except Exception as ex:
        warnings.warn(f"Unable to load {name}: {ex}")
    # every handled failure falls through to here: expose the object's
    # metadata under its name in place of the data
    setattr(self, name, self.metaget_(name))

load_all()

Handler (and alias) for Data.load("all").

Source code in pdr/pdr.py
610
611
612
613
614
615
616
617
618
619
620
def load_all(self):
    """
    Handler (and alias) for `Data.load("all")`.

    Calls `self.load` on every name in `self.keys()`, skipping names that
    match `OBJECTS_IGNORED_BY_DEFAULT` and silently passing over objects
    that are already loaded.
    """
    from pdr.loaders.dispatch import OBJECTS_IGNORED_BY_DEFAULT

    for object_name in self.keys():
        if not OBJECTS_IGNORED_BY_DEFAULT.match(object_name):
            try:
                self.load(object_name)
            except AlreadyLoadedError:
                # already present; nothing to do for this object
                pass

load_from_pointer(pointer: str, **load_kwargs: Any) -> dict[str, Union[pd.DataFrame, np.ndarray, str, MultiDict, 'PVLModule']]

PDS3 data object-loading handler. Set up the appropriate Loader for the object, set up load flow tracking, call the loader, and perform basic cleanup.

Source code in pdr/pdr.py
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
def load_from_pointer(
    self, pointer: str, **load_kwargs: Any
) -> dict[
    str, Union[pd.DataFrame, np.ndarray, str, MultiDict, "PVLModule"]
]:
    """
    PDS3 data object-loading handler. Set up the appropriate `Loader` for
    the object, set up load flow tracking, call the loader, and perform
    basic cleanup.

    The loader chosen for `pointer` is stored in `self.loaders[pointer]`
    and called with `self`, `pointer`, the tracker, and `load_kwargs`.
    When the loader is a `ReadFits` or `ReadCompressedImage` and produced
    an ndarray under `pointer`, `self._scaleflags[pointer]` is updated.

    In debug mode the loader is wrapped with `Dynamic.from_function`; if
    the wrapper recorded any errors, the most recent one is reported as a
    warning and `DebugExceptionPreempted` is raised.
    """
    from pdr.loaders.dispatch import pointer_to_loader

    loader = pointer_to_loader(pointer, self)
    if self.debug is True:
        loader = Dynamic.from_function(loader, optional=True)
    self.loaders[pointer] = loader
    # record which file and object the tracker is about to follow
    self.tracker.set_metadata(
        filename=self.file_mapping[pointer], obj=pointer
    )
    obj = self.loaders[pointer](
        self, pointer, tracker=self.tracker, **load_kwargs
    )
    # FITS arrays are scaled by default, and most 'desktop' images never
    # require scaling. we currently treat GeoTIFFs and JP2 as the only
    # exceptions.
    # TODO: assess whether there are non-GeoTIFF TIFFs floating around in
    #  the PDS that might still require scaling.
    # in debug mode the real loader sits behind the wrapper's bound method
    unwrap = loader.func.__self__ if self.debug is True else loader
    # NOTE(review): `obj[pointer]` is evaluated before the debug error
    #  check at the bottom; if a failed debug-mode load leaves `obj`
    #  non-subscriptable this would raise first. confirm.
    if (
        (unwrap.__class__.__name__ == "ReadFits")
        and (obj[pointer].__class__.__name__ == "ndarray")
    ):
        self._scaleflags[pointer] = True
    if (
        (unwrap.__class__.__name__ == 'ReadCompressedImage')
        and (obj[pointer].__class__.__name__ == "ndarray")
    ):
        from pdr.loaders.handlers import _check_prescaled_desktop

        self._scaleflags[pointer] = _check_prescaled_desktop(
            self.file_mapping[pointer]
        )
    if self.debug is True and len(loader.errors) > 0:
        # report the most recent recorded error, then signal the caller
        # that the failure has already been surfaced
        warnings.warn(
            f"Unable to load {pointer}: {loader.errors[-1]['exception']}"
        )
        raise DebugExceptionPreempted
    return obj

metablock(text: str, warn: bool = True) -> Optional[Mapping]

get the first value from this object's metadata whose key exactly matches text, even if it is nested inside a mapping, if the value itself is a mapping (e.g., nested PVL block, XML 'area', etc.) evaluate it using self.metadata.formatter. if there is no key matching 'text', will evaluate and return the metadata as a whole. WARNING: this function's return values are memoized for performance. updating elements of self.metadata that have already been accessed with this function will not update future calls to this function.

Source code in pdr/pdr.py
933
934
935
936
937
938
939
940
941
942
943
944
def metablock(self, text: str, warn: bool = True) -> Optional[Mapping]:
    """
    Fetch the first metadata block (nested PVL block, XML 'area', etc.)
    whose key exactly matches `text`, even if it is nested inside another
    mapping. Thin wrapper around `self.metadata.metablock`; `warn` is
    passed through unchanged.

    Warning:
        return values are memoized for performance. updating elements of
        self.metadata that have already been accessed with this function
        will not update future calls to this function.
    """
    block = self.metadata.metablock(text, warn)
    return block

metablock_(text: str) -> Optional[Mapping]

quiet-by-default version of metablock

Source code in pdr/pdr.py
946
947
948
def metablock_(self, text: str) -> Optional[Mapping]:
    """
    Quiet variant of `metablock`: performs the same lookup on
    `self.metadata` with warnings switched off.
    """
    warn = False
    return self.metadata.metablock(text, warn)

metaget(text: str, default: Any = None, warn: bool = True) -> Any

get the first value from this object's metadata whose key exactly matches text, even if it is nested inside a mapping. evaluate it using self.metadata.formatter.

Warning

this function's return values are memoized for performance. updating elements of self.metadata that have already been accessed with this function will not update future calls to this function.

Source code in pdr/pdr.py
914
915
916
917
918
919
920
921
922
923
924
925
926
927
def metaget(
    self, text: str, default: Any = None, warn: bool = True
) -> Any:
    """
    Fetch the first metadata value whose key exactly matches `text`, even
    if it is nested inside a mapping. Thin wrapper around
    `self.metadata.metaget`; `default` and `warn` are passed through
    unchanged.

    Warning:
        return values are memoized for performance. updating elements of
        self.metadata that have already been accessed with this function
        will not update future calls to this function.
    """
    value = self.metadata.metaget(text, default, warn)
    return value

metaget_(text: str, default: Any = None) -> Any

quiet-by-default version of metaget

Source code in pdr/pdr.py
929
930
931
def metaget_(self, text: str, default: Any = None) -> Any:
    """
    Quiet variant of `metaget`: performs the same lookup on
    `self.metadata` with warnings switched off.
    """
    warn = False
    return self.metadata.metaget(text, default, warn)

read_metadata(pvl_limit: int = DEFAULT_PVL_LIMIT, strict_decode: bool = True) -> Metadata

Attempt to ingest a product's metadata. If it is a PDS4 product, pds4_tools will already have ingested its detached XML label in Data._init_pds4(); in that case, simply preprocess it for Metadata.__init__. Otherwise, if it has a detached PDS3/PVL label, ingest it with pdr.parselabel.pds3.read_pvl. If we found no detached label, look for an attached PVL label instead (also using read_pvl). If we are in a "primary" mode, ignore all of that and ingest the product's metadata with the appropriate format-specific functions. Finally, construct a Metadata object from whatever we loaded and add all the objects it implies to our index.

Source code in pdr/pdr.py
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
def read_metadata(
    self, pvl_limit: int = DEFAULT_PVL_LIMIT, strict_decode: bool = True
) -> Metadata:
    """
    Ingest this product's metadata and wrap it in a `Metadata` object.

    The source depends on `self.standard`:

    * "FITS": headers are unpacked with `unpack_fits_headers`, which also
      supplies `self._hdumap`.
    * "PDS4": pds4_tools will already have ingested the detached XML label
      in `Data._init_pds4()`, so `self.label` is simply reformatted for
      `Metadata.__init__`.
    * 'desktop' image standards: metadata is skimmed from the image file
      itself.
    * anything else: a PVL label is read with `read_pvl` from the detached
      label if one was found (`self.labelname`), otherwise from the data
      file itself, i.e. as an attached label. Only in this case is
      "LABEL" registered in `self.file_mapping` and `self.index`.

    `pvl_limit` and `strict_decode` are only used by the PVL path.
    """
    if self.standard == "FITS":
        from pdr.loaders.handlers import unpack_fits_headers

        unpacked = unpack_fits_headers(
            self.filename, hdulist=self._hdulist
        )
        mapping, params, self._hdumap = unpacked
        return Metadata((mapping, params), standard="FITS")
    if self.standard == "PDS4":
        reformatted_label = reformat_pds4_tools_label(self.label)
        return Metadata(reformatted_label, standard="PDS4")
    if self.standard in DESKTOP_IMAGE_STANDARDS:
        from pdr.pil_utils import skim_image_data, paramdig

        skimmed = skim_image_data(self.filename)
        return Metadata(paramdig(skimmed))
    # self.labelname is None means we didn't find a detached label
    if self.labelname is None:
        label_source = self.filename
    else:
        label_source = self.labelname
    parsed_pvl = read_pvl(
        label_source,
        max_size=pvl_limit,
        default_strict_decode=strict_decode,
    )
    metadata = Metadata(parsed_pvl)
    # these assignments deliberately wait until the read has succeeded, to
    # facilitate debugging in cases where there is not in fact an attached
    # label or we couldn't read it
    self.labelname = label_source
    self.file_mapping["LABEL"] = label_source
    self.index.append("LABEL")
    return metadata

show(object_name: str = None, scaled: bool = True, **browse_kwargs: Any) -> Image

Produce an Image from a data object associated with this product. A convenient way to quickly look at data.

Source code in pdr/pdr.py
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
def show(
    self,
    object_name: str = None,
    scaled: bool = True,
    **browse_kwargs: Any
) -> Image:
    """
    Render one of this product's array objects as an Image; a convenient
    way to take a quick look at data.

    `object_name` is required despite its default: omitting it raises
    ValueError listing the available keys. Non-array objects raise
    TypeError. With `scaled=True` the array is passed through
    `get_scaled` first; otherwise it is used as stored. `browse_kwargs`
    are forwarded to the rendering helper.
    """
    if object_name is None:
        raise ValueError(
            f"please specify the name of an image object. "
            f"keys include {self.index}"
        )
    if self[object_name].__class__.__name__ != "ndarray":
        raise TypeError("Data.show only works on array data.")
    array = (
        self.get_scaled(object_name) if scaled is True
        else self[object_name]
    )
    # imported lazily: no need to have all this mpl stuff in the namespace
    # normally
    from pdr.browsify import _browsify_array

    return _browsify_array(array, save=False, outbase="", **browse_kwargs)

unloaded() -> tuple[str]

Return names of all identified but unloaded data objects.

Source code in pdr/pdr.py
530
531
532
def unloaded(self) -> tuple[str, ...]:
    """
    Return names of all identified but unloaded data objects.

    An object counts as loaded when its name appears in `dir(self)`.
    Names are returned in the order they appear in `self.index`.
    """
    # compute the attribute listing once rather than once per index entry
    present = set(dir(self))
    return tuple(name for name in self.index if name not in present)

DebugExceptionPreempted

Bases: Exception

Stub Exception subclass for selectively ignoring Exceptions from load failures when not in debug mode.

Source code in pdr/pdr.py
211
212
213
214
215
216
class DebugExceptionPreempted(Exception):
    """
    Marker Exception with no behavior of its own. `load_from_pointer`
    raises it in debug mode after it has already warned about a loader
    error, and `load` catches and discards it so the same failure is not
    reported a second time.
    """

Metadata

Bases: MultiDict

MultiDict subclass intended primarily as a helper class for Data. includes various convenience methods for handling metadata syntaxes, common access and display interfaces, etc.

Source code in pdr/pdr.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
class Metadata(MultiDict):
    """
    MultiDict subclass that serves mainly as the metadata helper for Data.
    Adds convenience methods for looking up (possibly nested) values and
    blocks by key name, a set of common identifiers, and a shared display
    format.
    """

    # memoized lookup callables; (re)built by refresh_cache()
    _metaget_interior: Callable[[str, Any], Any]
    _metablock_interior: Callable[[str], Mapping]

    def __init__(
        self,
        mapping_params: tuple[Mapping, Collection[str]],
        standard: Literal["PDS3", "PDS4", "FITS"] = "PDS3",
        **kwargs
    ):
        """
        Build from a `(mapping, params)` pair: `mapping` (with `kwargs`)
        initializes the underlying MultiDict, and `params` is the
        collection of key names that gets tallied into `fieldcounts`.
        """
        mapping, params = mapping_params
        super().__init__(mapping, **kwargs)
        self.fieldcounts = countby(identity, params)
        self.standard = standard
        # the identifiers are read through the cached lookups, so the
        # cache has to exist first
        self.refresh_cache()
        self.identifiers = self._init_identifiers()

    # note that 'directly' caching the lookup methods can result in
    # recursive reference chains behind the lru_cache API that can prevent
    # the Metadata object from being garbage-collected, which is why they
    # are hidden behind factory-built wrappers. there may be a cleaner way
    # to do this.
    def refresh_cache(self):
        """Rebuild both memoized lookup callables from scratch."""
        self._metaget_interior = _metaget_factory(self)
        self._metablock_interior = _metablock_factory(self)

    def metaget(
        self, text: str, default: Any = None, warn: bool = True
    ) -> Any:
        """
        Fetch the first value whose key exactly matches `text`, even if it
        is nested inside a mapping. Returns `default` when no key named
        `text` is known. Warns with `DuplicateKeyWarning` when `warn` is
        True and more than one such key exists.

        Warning:
            This function's return values are memoized for performance.
            Updating elements of a `Metadata` object's underlying mapping
            that have already been accessed with this function will not
            update future calls to this function.
        """
        occurrences = self.fieldcounts.get(text)
        if occurrences is None:
            return default
        if warn is True and occurrences > 1:
            warnings.warn(
                f"More than one value for {text} exists in the metadata. "
                f"Returning only the first.",
                DuplicateKeyWarning,
            )
        return self._metaget_interior(text, default)

    def metaget_(self, text: str, default: Any = None) -> Any:
        """Same as `metaget`, with warnings switched off."""
        warn = False
        return self.metaget(text, default, warn)

    def metaget_fuzzy(self, text: str) -> Any:
        """
        Like `metaget()`, but fuzzy-matches key names: looks up the known
        key with the highest Levenshtein ratio to `text`. Returns None
        when no keys are known.
        """
        import Levenshtein as lev

        similarity = {
            key: lev.ratio(key, text) for key in set(self.fieldcounts)
        }
        if not similarity:
            return None
        best_score = max(similarity.values())
        # first key (in iteration order) that reaches the best score wins
        for key, score in similarity.items():
            if score == best_score:
                return self.metaget(key)

    def metablock(self, text: str, warn: bool = True) -> Optional[Mapping]:
        """
        Fetch the first block (nested PVL block, XML 'area', etc.) whose
        key exactly matches `text`, even if it is nested inside a mapping.
        Returns None when no key named `text` is known. Warns with
        `DuplicateKeyWarning` when `warn` is True and more than one such
        key exists.

        Warning:
            This function's return values are memoized for performance.
            Updating elements of a `Metadata` object's underlying mapping
            that have already been accessed with this function will not
            update future calls to this function.
        """
        occurrences = self.fieldcounts.get(text)
        if occurrences is None:
            return None
        if warn is True and occurrences > 1:
            warnings.warn(
                f"More than one block named {text} exists in the metadata. "
                f"Returning only the first.",
                DuplicateKeyWarning,
            )
        return self._metablock_interior(text)

    def metablock_(self, text: str) -> Optional[Mapping]:
        """Same as `metablock`, with warnings switched off."""
        warn = False
        return self.metablock(text, warn)

    def _init_identifiers(self) -> DataIdentifiers:
        """
        Collect the common PDS3 data identifiers used in special-case
        checks: one entry per field declared on `DataIdentifiers`, with
        an empty string for any field missing from the metadata.
        """
        identifiers = {}
        for field in get_annotations(DataIdentifiers):
            value = self.metaget_(field, "")
            # it never does us any favors to have tuples or sets in here
            if isinstance(value, (tuple, set)):
                value = str(value)
            identifiers[field] = value
        return identifiers

    def __str__(self):
        """Pretty-printed contents wrapped in `Metadata(...)`."""
        pretty = prettify_multidict(self)
        return f"Metadata({pretty})"

    def __repr__(self):
        """Same text as `__str__`."""
        pretty = prettify_multidict(self)
        return f"Metadata({pretty})"

__init__(mapping_params: tuple[Mapping, Collection[str]], standard: Literal['PDS3', 'PDS4', 'FITS'] = 'PDS3', **kwargs)

Source code in pdr/pdr.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def __init__(
    self,
    mapping_params: tuple[Mapping, Collection[str]],
    standard: Literal["PDS3", "PDS4", "FITS"] = "PDS3",
    **kwargs
):
    """
    Build a Metadata object from a `(mapping, params)` pair.

    `mapping` (together with `kwargs`) is handed to the parent
    constructor; `params` is tallied with `countby(identity, params)`
    into `self.fieldcounts`, which the lookup methods consult before
    searching. `standard` is stored as given.
    """
    mapping, params = mapping_params
    super().__init__(mapping, **kwargs)
    self.fieldcounts = countby(identity, params)
    self.standard = standard
    # build the lookup cache before collecting identifiers
    self.refresh_cache()
    self.identifiers = self._init_identifiers()

__repr__()

Source code in pdr/pdr.py
203
204
205
def __repr__(self):
    """Pretty-printed contents wrapped in `Metadata(...)`."""
    pretty = prettify_multidict(self)
    return f"Metadata({pretty})"

__str__()

Source code in pdr/pdr.py
199
200
201
def __str__(self):
    """Pretty-printed contents wrapped in `Metadata(...)`."""
    pretty = prettify_multidict(self)
    return f"Metadata({pretty})"

_init_identifiers() -> DataIdentifiers

Initializes common PDS3 data identifiers for use in special-case checks.

Source code in pdr/pdr.py
184
185
186
187
188
189
190
191
192
193
194
195
196
197
def _init_identifiers(self) -> DataIdentifiers:
    """
    Collect the common PDS3 data identifiers used in special-case checks:
    one entry per field declared on `DataIdentifiers`, with an empty
    string for any field missing from the metadata.
    """
    identifiers = {}
    for field in get_annotations(DataIdentifiers):
        value = self.metaget_(field, "")
        # it never does us any favors to have tuples or sets in here
        if isinstance(value, (tuple, set)):
            value = str(value)
        identifiers[field] = value
    return identifiers

metablock(text: str, warn: bool = True) -> Optional[Mapping]

Get the first value from this object whose key exactly matches text, even if it is nested inside a mapping. If the value is itself a mapping (e.g., a nested PVL block, an XML 'area', etc.), return it; if the matching value is not a mapping, return the metadata as a whole. If no key matches text, return None. Raise a warning if there are multiple keys matching text.

Warning

This function's return values are memoized for performance. Updating elements of a Metadata object's underlying mapping that have already been accessed with this function will not update future calls to this function.

Source code in pdr/pdr.py
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
def metablock(self, text: str, warn: bool = True) -> Optional[Mapping]:
    """
    Fetch the first block (nested PVL block, XML 'area', etc.) whose key
    exactly matches `text`, even if it is nested inside a mapping.

    Returns None when `text` is not among the known field names. When it
    is, the lookup is delegated to the memoized `_metablock_interior`.
    Warns with `DuplicateKeyWarning` when `warn` is True and more than one
    key named `text` exists.

    Warning:
        This function's return values are memoized for performance.
        Updating elements of a `Metadata` object's underlying mapping
        that have already been accessed with this function will not update
        future calls to this function.
    """
    occurrences = self.fieldcounts.get(text)
    if occurrences is None:
        return None
    if warn is True and occurrences > 1:
        warnings.warn(
            f"More than one block named {text} exists in the metadata. "
            f"Returning only the first.",
            DuplicateKeyWarning,
        )
    return self._metablock_interior(text)

metablock_(text: str) -> Optional[Mapping]

quiet-by-default version of metablock

Source code in pdr/pdr.py
180
181
182
def metablock_(self, text: str) -> Optional[Mapping]:
    """Same as `metablock`, with warnings switched off."""
    warn = False
    return self.metablock(text, warn)

metaget(text: str, default: Any = None, warn: bool = True) -> Any

get the first value from this object whose key exactly matches text, even if it is nested inside a mapping. optionally evaluate it using self.formatter. raise a warning if there are multiple keys matching this.

Warning

This function's return values are memoized for performance. Updating elements of a Metadata object's underlying mapping that have already been accessed with this function will not update future calls to this function.

Source code in pdr/pdr.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def metaget(
    self, text: str, default: Any = None, warn: bool = True
) -> Any:
    """
    Fetch the first value whose key exactly matches `text`, even if it is
    nested inside a mapping.

    Returns `default` when `text` is not among the known field names. When
    it is, the lookup is delegated to the memoized `_metaget_interior`.
    Warns with `DuplicateKeyWarning` when `warn` is True and more than one
    key named `text` exists.

    Warning:
        This function's return values are memoized for performance.
        Updating elements of a `Metadata` object's underlying mapping
        that have already been accessed with this function will not update
        future calls to this function.
    """
    occurrences = self.fieldcounts.get(text)
    if occurrences is None:
        return default
    if warn is True and occurrences > 1:
        warnings.warn(
            f"More than one value for {text} exists in the metadata. "
            f"Returning only the first.",
            DuplicateKeyWarning,
        )
    return self._metaget_interior(text, default)

metaget_(text: str, default: Any = None) -> Any

quiet-by-default version of metaget

Source code in pdr/pdr.py
137
138
139
def metaget_(self, text: str, default: Any = None) -> Any:
    """Same as `metaget`, with warnings switched off."""
    warn = False
    return self.metaget(text, default, warn)

metaget_fuzzy(text: str) -> Any

Like metaget(), but fuzzy-matches key names.

Source code in pdr/pdr.py
141
142
143
144
145
146
147
148
149
150
151
def metaget_fuzzy(self, text: str) -> Any:
    """
    Like `metaget()`, but fuzzy-matches key names: looks up the known key
    with the highest Levenshtein ratio to `text`. Returns None when no
    keys are known.
    """
    import Levenshtein as lev

    similarity = {
        key: lev.ratio(key, text) for key in set(self.fieldcounts)
    }
    if not similarity:
        return None
    best_score = max(similarity.values())
    # first key (in iteration order) that reaches the best score wins
    for key, score in similarity.items():
        if score == best_score:
            return self.metaget(key)

_metablock_factory(metadata: Metadata) -> Callable[[str], Mapping]

Factory function for an internal component of metablock(). Reduces the risk that the metadata access cache will create reference cycles.

Source code in pdr/pdr.py
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
def _metablock_factory(metadata: Metadata) -> Callable[[str], Mapping]:
    """
    Build the memoized lookup used internally by `metablock()`. Keeping
    the cached function in a closure, rather than caching a method
    directly, reduces the risk that the metadata access cache will create
    reference cycles.
    """
    def metablock_interior(text):
        """
        Dig `text` out of `metadata`; if what comes back is not a
        Mapping, fall back to `metadata` itself.
        """
        found = dig_for_value(metadata, text, mtypes=(dict, MultiDict))
        return found if isinstance(found, Mapping) else metadata

    return cache(metablock_interior)

pdrtypes

Axname: TypeAlias = Literal['BAND', 'LINE', 'SAMPLE'] module-attribute

Conventional names for image axes.

BandStorageType: TypeAlias = Literal['BAND_SEQUENTIAL', 'LINE_INTERLEAVED', 'SAMPLE_INTERLEAVED', None] module-attribute

Codes for physical storage layout of 3-D arrays. Also known as BSQ/band sequential, BIL/band interleaved by line, BIP/band interleaved by pixel. None implies either that the storage layout is unknown or that the array is not 3-D.

ByteOrder: TypeAlias = Literal['<', '>'] module-attribute

Most significant/least significant byteorder codes

LoaderFunction: TypeAlias = Callable[..., Union[str, 'MultiDict', 'pd.DataFrame', 'np.ndarray']] module-attribute

Signature of a Loader's load function

PDRLike: TypeAlias = Union['Data', 'Metadata'] module-attribute

Something with a pdr-style metadata-getting interface

PhysicalTarget: TypeAlias = Union[list[str, int], tuple[str, int], int, str, dict[str, Union[str, int]]] module-attribute

Expected formats of 'pointer' parameters, i.e. ^WHATEVER = PhysicalTarget

DataIdentifiers

Bases: TypedDict

Standard PDS3 'identifiers' Data checks its Metadata for on initialization (if it's made from a PDS3 product). Used primarily to make special case checks more compact. These are taken directly from the label, then stringified if they're sets or tuples. All keys are always present, but may be None if a parameter's not actually in the label.

Source code in pdr/pdrtypes.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
class DataIdentifiers(TypedDict):
    """
    Standard PDS3 'identifiers' that Data looks up in its Metadata on
    initialization (if it's made from a PDS3 product), used primarily to
    keep special-case checks compact. Values come directly from the label
    and are stringified if they are sets or tuples. Every key is always
    present.

    NOTE(review): the annotations allow None for parameters absent from
    the label, while `Metadata._init_identifiers` fills absent parameters
    with an empty string. confirm which value callers should expect.
    """
    DATA_SET_ID: Optional[str]
    DATA_SET_NAME: Optional[str]
    DATA_QUALITY_DESC: Optional[str]
    FILE_NAME: Optional[str]
    FILE_RECORDS: Optional[int]
    INSTRUMENT_ID: Optional[str]
    INSTRUMENT_HOST_NAME: Optional[str]
    INSTRUMENT_NAME: Optional[str]
    LABEL_RECORDS: Optional[int]
    NOTE: Optional[str]
    PRODUCT_ID: Optional[str]
    PRODUCT_TYPE: Optional[str]
    RECORD_BYTES: Optional[int]
    RECORD_TYPE: Optional[str]
    ROW_BYTES: Optional[int]
    ROWS: Optional[int]
    SPACECRAFT_NAME: Optional[str]
    STANDARD_DATA_PRODUCT_ID: Optional[str]

ImageProps

Bases: TypedDict

Standard image properties dict used in image-processing workflows.

Source code in pdr/pdrtypes.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
class ImageProps(TypedDict):
    """
    Standard image properties dict used in image-processing workflows.

    Keys describe an array's element type, its dimensions, and the amount
    of padding that surrounds the image data proper.
    """
    # Number of bytes per pixel (eventually redundant with sample_type but
    # populated much earlier)
    BYTES_PER_PIXEL: Literal[1, 2, 4, 8]
    # Do the elements of the array, when loaded, represent VAX reals?
    is_vax_real: bool
    # numpy dtype string
    sample_type: str
    # total number of elements
    pixels: int
    # number of elements along each dimension
    nrows: int
    ncols: int
    nbands: int
    # physical storage layout of 3D arrays (None for 2D arrays)
    band_storage_type: BandStorageType
    # total row/column/band pad elements due to ISIS-style axplanes
    rowpad: int
    colpad: int
    bandpad: int
    # number of pad elements for left/right sideplanes
    prefix_rows: Optional[int]
    suffix_rows: Optional[int]
    # number of pad elements for bottom/topplanes
    prefix_cols: Optional[int]
    suffix_cols: Optional[int]
    # number of pad elements for front/backplanes
    prefix_bands: Optional[int]
    suffix_bands: Optional[int]
    # total pad elements due to line prefixes/suffixes
    linepad: int
    # number of elements in line prefix and suffix
    line_prefix_pix: Optional[int]
    line_suffix_pix: Optional[int]
    # Order of axes expressed as a tuple of axis names, only used by ISIS qubes
    # NOTE(review): `tuple[Axname]` denotes a 1-tuple; presumably
    #  `tuple[Axname, ...]` is meant. confirm before changing.
    axnames: Optional[tuple[Axname]]

pil_utils

Utilities for dealing with 'desktop'-format images using pillow.

Not all of this ultimately belongs here. Also, we might want to use OpenCV for some things instead.

pvl_utils

utilities for working with the pvl library.

TimelessOmniDecoder

Bases: OmniDecoder

Source code in pdr/pvl_utils.py
16
17
18
19
20
21
22
class TimelessOmniDecoder(pvl.decoder.OmniDecoder):
    """
    `OmniDecoder` variant that always uses an `OmniGrammar` and refuses to
    decode datetimes.
    """
    def __init__(self, *args, **kwargs):
        """
        Forward all arguments to `OmniDecoder`, supplying a fresh
        `OmniGrammar` as `grammar`.
        """
        omni_grammar = pvl.grammar.OmniGrammar()
        super().__init__(*args, grammar=omni_grammar, **kwargs)

    def decode_datetime(self, value: str):
        """
        Always raise ValueError, whatever `value` is.

        NOTE(review): presumably this makes the decoder treat time-like
        values as something other than datetimes. confirm against pvl.
        """
        raise ValueError

cached_pvl_load(reference) cached

Source code in pdr/pvl_utils.py
25
26
27
28
29
30
@cache
def cached_pvl_load(reference):
    """
    Load PVL from `reference` with `pvl.load`, decoding it with a
    `TimelessOmniDecoder`. The function is wrapped in `cache`, so results
    are memoized per `reference`.
    """
    import pvl

    decoder = TimelessOmniDecoder()
    return pvl.load(reference, decoder=decoder)

utils

generic i/o, parsing, and functional utilities.

SUPPORTED_COMPRESSION_EXTENSIONS = ('.gz', '.bz2', '.zip') module-attribute

compression 'types' we support

append_repeated_object(obj: Union[Sequence, Mapping], fields: MutableSequence, repeat_count: int) -> MutableSequence

Polymorphic function to append obj repeat_count times to fields. If obj is a non-string sequence, it instead concatenates and adds it. For instance:

>>> append_repeated_object([1, 2], [4], 3)
[4, 1, 2, 1, 2, 1, 2]
>>> append_repeated_object({"a": "b"}, ["a"], 3)
["a", {"a": "b"}, {"a": "b"}, {"a": "b"}]

NOTE: This function treats repeat_count values < 1 as 1. WARNING: this function does not copy obj or any of its elements, even if they are mutable. This is not a bug, but can cause unexpected behavior, so take care (and in particular, always go depth-first if you are using this function in a recursive operation).

Source code in pdr/utils.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def append_repeated_object(
    obj: Union[Sequence, Mapping],
    fields: MutableSequence,
    repeat_count: int,
) -> MutableSequence:
    """
    Add `obj` to the end of `fields` `repeat_count` times, in place, and
    return `fields`.

    How `obj` is added depends on whether it defines `__add__`:

    * if it does (lists, tuples -- and, note, strings as well), its
      _elements_ are concatenated onto `fields`;
    * if it does not (e.g. a dict), `obj` itself is appended as one element.

    ```
    >>> append_repeated_object([1, 2], [4], 3)
    [4, 1, 2, 1, 2, 1, 2]
    >>> append_repeated_object({"a": "b"}, ["a"], 3)
    ["a", {"a": "b"}, {"a": "b"}, {"a": "b"}]
    ```

    NOTE: any `repeat_count` that is not greater than 1 is treated as 1.
    WARNING: neither `obj` nor its elements are copied, so every repetition
    refers to the same underlying object(s). This is intentional, but it
    means mutating one repetition mutates them all; when using this function
    recursively, work depth-first.
    """
    repeated = repeat_count > 1
    if hasattr(obj, "__add__"):
        # concatenable: splice the elements of obj in, once or many times
        fields += (
            chain.from_iterable([obj] * repeat_count) if repeated else obj
        )
    elif repeated:
        fields += [obj] * repeat_count
    else:
        fields.append(obj)
    return fields

associate_label_file(data_filename: str, label_filename: Optional[str] = None, skip_check: bool = False) -> Optional[str]

Source code in pdr/utils.py
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
def associate_label_file(
    data_filename: str,
    label_filename: Optional[str] = None,
    skip_check: bool = False,
) -> Optional[str]:
    """
    work out the path of the label file that goes with `data_filename`.

    resolution order:

    1. an explicitly-given `label_filename`, made absolute. `skip_check` is
       forwarded to `check_cases()` in this case only.
    2. `data_filename` itself, if its (case-insensitive) extension is one of
       the known label extensions, or if it looks like a Chang'e label.
    3. `data_filename` with each known label extension substituted in turn;
       the first one `check_cases()` can resolve wins.

    returns None if step 3 finds nothing. in steps 1 and 2, a
    FileNotFoundError from `check_cases()` propagates to the caller.
    """
    from pdr.loaders.utility import LABEL_EXTENSIONS, CHANG_LBL_EXTENSIONS

    if label_filename is not None:
        explicit_label = Path(label_filename).absolute()
        return check_cases(explicit_label, skip_check)
    lowered = data_filename.lower()
    is_label = lowered.endswith(LABEL_EXTENSIONS)
    if not is_label:
        # Chang'e labels: the filename contains "CEn", n being the mission
        # number, and ends with one of the Chang'e-specific extensions
        names_change_mission = any(f"ce{m}" in lowered for m in "123456")
        is_label = names_change_mission and lowered.endswith(
            CHANG_LBL_EXTENSIONS
        )
    if is_label:
        return check_cases(data_filename)
    for label_extension in LABEL_EXTENSIONS:
        candidate = with_extension(data_filename, label_extension)
        try:
            return check_cases(candidate)
        except FileNotFoundError:
            # no such label next to the data file; try the next extension
            pass
    return None

check_cases(filenames: Union[Collection[Union[Path, str]], Union[Path, str]], skip: bool = False) -> str

check for oddly-cased versions of a specified filename in local path -- very common to have case mismatches between PDS3 labels and actual archive contents. similarly, check common compression extensions.

If skip is True, the function performs no filesystem checks and simply returns the first filename as a string.

Source code in pdr/utils.py
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def check_cases(
    filenames: Union[Collection[Union[Path, str]], Union[Path, str]],
    skip: bool = False,
) -> str:
    """
    resolve candidate filename(s) to a path that exists on disk, tolerating
    the case mismatches between PDS3 labels and archive contents that are
    very common in practice, as well as common compression extensions.

    `filenames` is passed through `listify()` and the candidates are tried
    in order. for each one:

    * if `skip` is True, it is returned immediately, with no filesystem
      access at all;
    * if it exists exactly as given, it is returned;
    * otherwise its parent directory is scanned for entries whose
      `stem_path()` equals the candidate's lowercased name. the first such
      entry is returned (with a warning if there were several).

    always returns a `str`. raises FileNotFoundError if no candidate could
    be resolved.
    """
    candidates = listify(filenames)
    for candidate in candidates:
        if skip is True:
            return str(candidate)
        candidate_path = Path(candidate)
        if candidate_path.exists():
            return str(candidate)
        directory = candidate_path.parent
        if not directory.exists():
            continue
        wanted = candidate_path.name.lower()
        loose_matches = tuple(
            entry
            for entry in directory.iterdir()
            if stem_path(entry) == wanted
        )
        if not loose_matches:
            continue
        chosen = loose_matches[0]
        if len(loose_matches) > 1:
            match_names = ", ".join(match.name for match in loose_matches)
            warnings.warn(
                f"Multiple off-case or possibly-compressed versions of "
                f"{candidate} found in search path: {match_names}. Using "
                f"{chosen.name}."
            )
        return str(chosen)
    checked = ";".join(map(str, candidates))
    raise FileNotFoundError(
        f"No candidate paths for required file exist. Checked:{checked}"
    )

check_primary_fmt(data_filename: str) -> Union[str, None]

Source code in pdr/utils.py
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
def check_primary_fmt(data_filename: str) -> Union[str, None]:
    """
    determine whether `data_filename` is in a format identified directly
    from the file itself: FITS or a supported 'desktop' image standard.

    returns:
        * "FITS" if the lowercased filename ends with one of
          `FITS_EXTENSIONS` (the file is not opened);
        * the format name reported by pillow, if the filename looks like a
          desktop image and pillow identifies it as one of
          `DESKTOP_IMAGE_STANDARDS`;
        * None otherwise, including when pillow cannot identify the file.

    raises ModuleNotFoundError if the filename looks like a desktop image
    but pillow is not installed.
    """
    from pdr.loaders.utility import (
        DESKTOP_IMAGE_EXTENSIONS,
        FITS_EXTENSIONS,
        looks_like_this_kind_of_file,
        DESKTOP_IMAGE_STANDARDS
    )

    lower = data_filename.lower()
    if any(lower.endswith(ext) for ext in FITS_EXTENSIONS):
        return "FITS"
    if not looks_like_this_kind_of_file(lower, DESKTOP_IMAGE_EXTENSIONS):
        return None
    try:
        from PIL import Image, UnidentifiedImageError
    except ImportError as import_error:
        raise ModuleNotFoundError(
            "Reading desktop image formats requires the 'pillow' library."
        ) from import_error
    # NOTE: this is a process-wide setting; it turns off pillow's
    # image-size ("decompression bomb") limit for all subsequent opens
    Image.MAX_IMAGE_PIXELS = None
    try:
        # Image.open() is lazy and keeps the file open, so use it as a
        # context manager to release the handle once we've read the format
        with Image.open(data_filename) as image:
            standard = image.format
    except UnidentifiedImageError:
        return None
    # explicit membership test rather than `assert`, which is stripped
    # when Python runs with -O
    if standard not in DESKTOP_IMAGE_STANDARDS:
        return None
    return standard

decompress(filename)

Open FILENAME. If its name suffix indicates one of the supported compression algorithms, transparently decompress it.

Source code in pdr/utils.py
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
def decompress(filename):
    """Open FILENAME for reading in binary mode.  If its name suffix
    indicates one of the supported compression algorithms (.gz, .bz2, .zip;
    matched case-insensitively), transparently decompress it.

    Returns a readable binary file-like object: the plain file, a GzipFile,
    a BZ2File, or -- for zip archives -- a stream over the FIRST member of
    the archive only.

    Raises OSError (or a subclass) if FILENAME cannot be opened.  Any error
    raised while setting up decompression (e.g. zipfile.BadZipFile for a
    corrupt archive, IndexError for an empty one) propagates unchanged, but
    the underlying file handle is closed first so it does not leak.

    NOTE: per the standard library documentation, closing a GzipFile or
    BZ2File that wraps an existing file object does not close that object.
    """
    # open the file directly to ensure that we get a regular OSError
    # (subclass), instead of a GzipError or something, if the file
    # doesn't exist or there's some other OS-level problem with it
    fp = open(filename, "rb")
    try:
        # this will be the _last_ suffix only, e.g. "foo.tar.gz" -> ".gz"
        suffix = Path(filename).suffix.lower()
        if suffix == ".gz":
            return import_best_gzip().GzipFile(fileobj=fp)
        if suffix == ".bz2":
            import bz2
            return bz2.BZ2File(fp)
        if suffix == ".zip":
            from zipfile import ZipFile
            archive = ZipFile(fp)
            return archive.open(archive.infolist()[0])
    except BaseException:
        # we could not hand a working stream to the caller, so nobody else
        # will ever be able to close fp
        fp.close()
        raise
    return fp

find_repository_root(absolute_path)

Source code in pdr/utils.py
187
188
189
190
191
192
193
def find_repository_root(absolute_path):
    """
    return, as a Path, everything in `absolute_path` that precedes its LAST
    component named "data" (compared case-insensitively).

    raises IndexError if no component of the path is named "data".
    """
    components = Path(absolute_path).parts
    lowered = tuple(map(str.lower, components))
    data_positions = [
        position for position, name in enumerate(lowered) if name == "data"
    ]
    # the last "data" wins; indexing an empty list here is what raises
    # IndexError for paths with no "data" component
    cutoff = data_positions[-1]
    return Path(*components[:cutoff])

head_file(fn_or_reader: Union[IO, Path, str], nbytes: Union[int, None] = None, offset: int = 0, tail: bool = False) -> BytesIO

Source code in pdr/utils.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def head_file(
    fn_or_reader: Union[IO, Path, str],
    nbytes: Union[int, None] = None,
    offset: int = 0,
    tail: bool = False,
) -> BytesIO:
    """
    read a chunk of a file into an in-memory buffer.

    `fn_or_reader` may be a path, which is opened in binary mode, or any
    object with a `read` method, which is used as-is. in either case the
    reader is CLOSED before this function returns -- including when seeking
    or reading fails -- so do not pass a reader you intend to keep using.

    `nbytes` is the maximum number of bytes to read (None: read to the end).
    `offset` is where reading starts: counted from the start of the file
    by default, or backwards from the end of the file if `tail` is True.

    returns a BytesIO holding the bytes read, positioned at its start.
    """
    if not hasattr(fn_or_reader, "read"):
        fn_or_reader = open(fn_or_reader, "rb")
    try:
        if tail is True:
            # whence=2: offset is relative to the end of the stream
            fn_or_reader.seek(offset * -1, 2)
        else:
            # whence=0: offset is relative to the start of the stream
            fn_or_reader.seek(offset, 0)
        head_buffer = BytesIO()
        head_buffer.write(fn_or_reader.read(nbytes))
    finally:
        # release the handle even if seek()/read() raised
        fn_or_reader.close()
    head_buffer.seek(0)
    return head_buffer

import_best_gzip()

Source code in pdr/utils.py
151
152
153
154
155
156
157
def import_best_gzip():
    """
    return a gzip module: `isal.igzip` if it can be imported, otherwise the
    standard library's `gzip`.
    """
    try:
        from isal import igzip
    except ImportError:
        # isal is unavailable; fall back to the standard library
        import gzip

        return gzip
    return igzip

prettify_multidict(multi, sep=' ', indent=0)

Source code in pdr/utils.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def prettify_multidict(multi, sep=" ", indent=0):
    """
    render `multi` as a brace-delimited string of "key: value," entries.

    rendering is first attempted on a single line (the default `sep` of one
    space). if the accumulated output grows past 70 characters while in
    single-line mode, rendering starts over with a newline as `sep` and
    `indent` increased by 1, putting each entry on its own indented line.

    values that are MultiDicts are rendered recursively (starting again in
    single-line mode, with `indent` increased by 2). string values longer
    than 70 characters are wrapped with `textwrap.wrap`.

    multi: the mapping to render; only its `items()` method is used here.
    sep: separator written after each entry's trailing comma.
    indent: number of spaces placed before each entry in multi-line mode.

    returns the rendered string.
    """
    indentation, output, first_line = "", "{", True
    for k, v in multi.items():
        if sep == "\n":
            # multi-line mode: entries are indented, and the opening brace
            # gets a line to itself
            indentation = " " * indent
            if first_line is True:
                output += "\n"
        if isinstance(v, MultiDict):
            # nested mapping: recurse; sep is not passed, so the nested
            # render starts in single-line mode
            output += (
                f"{indentation}{k}: "
                f"{prettify_multidict(v, indent = indent + 2)},{sep}"
            )
        elif (not isinstance(v, str)) or (len(v) <= 70):
            # non-strings and short strings are written as-is
            output += f"{indentation}{k}: {v},{sep}"
        else:
            # long string: wrap it, with continuation lines indented one
            # space deeper than `indent`
            lines = textwrap.wrap(v, width=(70 - len(indentation)))
            vstr = f"{lines[0]}\n" + "\n".join(
                [(" " * (indent + 1)) + line for line in lines[1:]]
            )
            output += f"{indentation}{k}: {vstr},{sep}"
        first_line = False
        # the length check below applies only in single-line mode
        if sep != " ":
            continue
        if len(output) > 70:
            # too wide for one line: discard what we have and re-render the
            # whole mapping in multi-line mode
            return prettify_multidict(multi, sep="\n", indent=indent + 1)
    # the closing brace sits one space shallower than the entries
    if len(indentation) > 0:
        indentation = indentation[:-1]
    return output + indentation + "}"

read_hex(hex_string: str, fmt: str = '>I') -> Number

return the decimal representation of a hexadecimal number in a given number format (expressed as a struct-style format string, default is unsigned 32-bit integer)

Source code in pdr/utils.py
31
32
33
34
35
36
37
def read_hex(hex_string: str, fmt: str = ">I") -> Number:
    """
    interpret a string of hexadecimal digits as a number.

    the digits are converted to raw bytes, which are then unpacked according
    to `fmt`, a struct-style format string (the default, ">I", is a
    big-endian unsigned 32-bit integer). if `fmt` describes several values,
    only the first is returned.
    """
    raw_bytes = bytes.fromhex(hex_string)
    unpacked = struct.unpack(fmt, raw_bytes)
    return unpacked[0]

stem_path(path: Path)

convert a Path to lowercase and remove any compression extensions from it to stem for loose matching

Source code in pdr/utils.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def stem_path(path: Path):
    """
    produce a normalized form of a Path's filename for loose matching: the
    name is lowercased and, if it has more than one suffix and the last one
    is a supported compression extension, that last suffix is dropped.

    only the final path component is considered. returns a str.
    """
    base = path.name.split('.')[0].lower()
    suffixes = tuple(suffix.lower() for suffix in path.suffixes)
    if not suffixes:
        return base
    # drop a trailing compression suffix, unless it's the only suffix
    if len(suffixes) > 1 and suffixes[-1] in SUPPORTED_COMPRESSION_EXTENSIONS:
        suffixes = suffixes[:-1]
    return base + "".join(suffixes)

with_extension(fn: Union[str, Path], new_suffix: str) -> str

Source code in pdr/utils.py
182
183
184
def with_extension(fn: Union[str, Path], new_suffix: str) -> str:
    """
    return `fn`, as a string, with its final suffix replaced by `new_suffix`
    (or with `new_suffix` added, if `fn` has no suffix).
    """
    swapped = Path(fn).with_suffix(new_suffix)
    return str(swapped)