Source code for swxsoc_reach.io.file_tools

"""
Provides generic file readers.
"""

import ast
import json
from pathlib import Path

import pandas as pd

# Public API of this module: the names exported by a star-import.
__all__ = ["read_file", "read_udl_json", "read_udl_csv"]


def read_file(file_path: Path) -> pd.DataFrame:
    """
    Load a supported data file into a pandas DataFrame.

    The reader is selected from the file extension (compared
    case-insensitively): ``.json`` is handled by ``read_udl_json`` and
    ``.csv`` by ``read_udl_csv``.

    Parameters
    ----------
    file_path : Path
        Location of the file to load. Values that are not already a
        ``Path`` are converted with ``Path(file_path)``.

    Returns
    -------
    pd.DataFrame
        The contents of the file as returned by the selected reader.

    Raises
    ------
    ValueError
        If the file extension is neither ``.json`` nor ``.csv``.
    """
    path = file_path if isinstance(file_path, Path) else Path(file_path)
    extension = path.suffix.lower()

    if extension == ".json":
        return read_udl_json(path)
    if extension == ".csv":
        return read_udl_csv(path)

    # Report the suffix exactly as given (not lower-cased) in the error.
    raise ValueError(f"Unsupported file type: {path.suffix}")
def _unpack_nested_columns(data: pd.DataFrame) -> pd.DataFrame:
    """
    Unpack nested ``seoList`` and ``senPos`` columns into flat columns and drop the originals.

    For every row, the ``obDescription``, ``obValue`` and ``obQuality`` entries
    of the *first* element of ``seoList`` become columns of the same name, and
    elements 0, 1 and 2 of ``senPos`` become ``senPos0``, ``senPos1`` and
    ``senPos2``. Any further elements of ``seoList`` are ignored.

    The input DataFrame is left untouched; a new DataFrame is returned.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame containing ``seoList`` and ``senPos`` columns.

    Returns
    -------
    pd.DataFrame
        DataFrame with the nested columns replaced by their unpacked values.
        The new columns are appended after the remaining original columns.

    Raises
    ------
    KeyError
        If ``data`` has no ``seoList`` or ``senPos`` column.
    """
    nested_observations = data["seoList"]
    nested_positions = data["senPos"]

    # Drop first so the new columns are written to a fresh frame rather than
    # to the caller's DataFrame.
    flat = data.drop(columns=["seoList", "senPos"])

    # Unpack seoList (first observation only).
    for field in ("obDescription", "obValue", "obQuality"):
        flat[field] = nested_observations.apply(
            lambda entries, field=field: entries[0][field]
        )

    # Unpack senPos into one column per component.
    for component in range(3):
        flat[f"senPos{component}"] = nested_positions.apply(
            lambda position, component=component: position[component]
        )

    return flat
def read_udl_json(file_path: Path) -> pd.DataFrame:
    """
    Reads a UDL JSON file and returns a pandas DataFrame.

    Unpacks nested JSON structures into a flat DataFrame. The file is parsed
    with ``pd.read_json`` first; if that raises ``ValueError`` the file is
    parsed again with the standard library ``json`` module, decoded as UTF-8.

    Parameters
    ----------
    file_path : Path
        The path to the UDL JSON file. Values that are not already a ``Path``
        are converted with ``Path(file_path)``.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the data from the UDL JSON file, with the
        nested ``seoList`` and ``senPos`` columns unpacked.
    """
    # Convert to Path if not already
    if not isinstance(file_path, Path):
        file_path = Path(file_path)

    try:
        data = pd.read_json(file_path)
    except ValueError:
        # pd.read_json uses ujson which can fail on very small or very large numeric values;
        # fall back to the standard library json parser.
        # Decode explicitly as UTF-8 (pd.read_json's default) so the fallback
        # does not depend on the platform's locale encoding.
        with open(file_path, "r", encoding="utf-8") as file:
            data = pd.DataFrame(json.load(file))

    return _unpack_nested_columns(data)
def _parse_nested_cell(cell):
    """
    Parse one CSV cell holding a serialized list/dict into Python objects.

    ``ast.literal_eval`` is tried first; it covers Python-repr text and any
    JSON text that is also a valid Python literal. JSON text containing
    ``null``, ``true`` or ``false`` is not a Python literal, so string cells
    that ``literal_eval`` rejects are retried with ``json.loads``. If both
    parsers fail, the original ``literal_eval`` error is raised.
    """
    try:
        return ast.literal_eval(cell)
    except (ValueError, SyntaxError) as literal_error:
        # Non-string cells (e.g. NaN from an empty field) cannot be JSON text.
        if not isinstance(cell, str):
            raise
        try:
            return json.loads(cell)
        except ValueError:
            # json.JSONDecodeError is a ValueError; surface the original error.
            raise literal_error from None


def read_udl_csv(file_path: Path) -> pd.DataFrame:
    """
    Reads a UDL CSV file and returns a pandas DataFrame.

    Unpacks nested structures stored as text in the ``seoList`` and ``senPos``
    columns of the CSV into a flat DataFrame.

    Parameters
    ----------
    file_path : Path
        The path to the UDL CSV file. Values that are not already a ``Path``
        are converted with ``Path(file_path)``.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the data from the UDL CSV file, with the
        nested ``seoList`` and ``senPos`` columns unpacked.

    Raises
    ------
    KeyError
        If the CSV has no ``seoList`` or ``senPos`` column.
    ValueError, SyntaxError
        If a ``seoList`` or ``senPos`` cell can be parsed neither as a Python
        literal nor as JSON.
    """
    # Convert to Path if not already
    if not isinstance(file_path, Path):
        file_path = Path(file_path)

    # Read the CSV file into a DataFrame
    data = pd.read_csv(file_path)

    # Convert the string representation of lists/dicts to actual lists/dicts.
    # Both single-quoted (Python repr) and double-quoted (JSON) formats are
    # accepted, including JSON's null/true/false constants.
    data["seoList"] = data["seoList"].apply(_parse_nested_cell)
    data["senPos"] = data["senPos"].apply(_parse_nested_cell)

    return _unpack_nested_columns(data)