Source code for swxsoc_reach.io.file_tools
"""
Provides generic file readers.
"""
import ast
import json
from pathlib import Path
import pandas as pd
__all__ = ["read_file", "read_udl_json", "read_udl_csv"]
[docs]
def read_file(file_path: Path) -> pd.DataFrame:
    """
    Load a supported data file into a pandas DataFrame.

    The reader is selected from the file extension (case-insensitive):
    ``.json`` is handled by ``read_udl_json`` and ``.csv`` by
    ``read_udl_csv``.

    Parameters
    ----------
    file_path : Path
        Location of the file to load. Non-``Path`` values are passed
        through ``Path(...)`` first.

    Returns
    -------
    pd.DataFrame
        The contents of the file as returned by the selected reader.

    Raises
    ------
    ValueError
        If the extension is neither ``.json`` nor ``.csv``.
    """
    path = file_path if isinstance(file_path, Path) else Path(file_path)

    # Compare on a lower-cased suffix; the error message below reports the
    # suffix exactly as it was given.
    extension = path.suffix.lower()
    if extension == ".json":
        return read_udl_json(path)
    if extension == ".csv":
        return read_udl_csv(path)
    raise ValueError(f"Unsupported file type: {path.suffix}")
def _unpack_nested_columns(data: pd.DataFrame) -> pd.DataFrame:
    """
    Unpack nested ``seoList`` and ``senPos`` columns into flat columns
    and drop the originals.

    Only the first entry of each ``seoList`` value is used; its
    ``obDescription``, ``obValue`` and ``obQuality`` items become columns
    of the same names. The first three elements of each ``senPos`` value
    become ``senPos0``, ``senPos1`` and ``senPos2``.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame containing ``seoList`` and ``senPos`` columns.

    Returns
    -------
    pd.DataFrame
        DataFrame with the nested columns replaced by their unpacked values.
        The remaining original columns keep their order and the unpacked
        columns are appended after them.

    Raises
    ------
    KeyError
        If ``seoList`` or ``senPos`` is missing from ``data``, or the first
        ``seoList`` entry of a row lacks one of the expected keys.
    IndexError
        If a list-valued ``seoList`` cell is empty or a list-valued
        ``senPos`` cell has fewer than three elements.
    """
    seo_list = data["seoList"]
    sen_pos = data["senPos"]

    # Collect the new columns separately instead of assigning them onto
    # ``data`` so the caller's frame is never mutated.
    unpacked = {}
    for key in ("obDescription", "obValue", "obQuality"):
        # ``key=key`` binds the current value to the lambda.
        unpacked[key] = seo_list.apply(lambda entries, key=key: entries[0][key])
    for i in range(3):
        unpacked[f"senPos{i}"] = sen_pos.apply(lambda position, i=i: position[i])

    # Drop the nested columns, then append the flat ones in a new frame.
    return data.drop(columns=["seoList", "senPos"]).assign(**unpacked)
[docs]
def read_udl_json(file_path: Path) -> pd.DataFrame:
    """
    Reads a UDL JSON file and returns a pandas DataFrame.

    Unpacks nested JSON structures into a flat DataFrame.

    Parameters
    ----------
    file_path : Path
        The path to the UDL JSON file. Non-``Path`` values are converted
        with ``Path(...)``.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the data from the UDL JSON file, with the
        ``seoList`` and ``senPos`` columns unpacked by
        ``_unpack_nested_columns``.
    """
    # Convert to Path if not already
    if not isinstance(file_path, Path):
        file_path = Path(file_path)

    try:
        data = pd.read_json(file_path)
    except ValueError:
        # pd.read_json uses ujson which can fail on very small or very large numeric values;
        # fall back to the standard library json parser.
        # Decode explicitly as UTF-8 so the fallback matches pd.read_json's
        # default encoding instead of depending on the platform locale.
        with open(file_path, "r", encoding="utf-8") as file:
            data = pd.DataFrame(json.load(file))

    return _unpack_nested_columns(data)
[docs]
def read_udl_csv(file_path: Path) -> pd.DataFrame:
    """
    Reads a UDL CSV file and returns a pandas DataFrame.

    Unpacks nested JSON structures in the CSV into a flat DataFrame.

    Parameters
    ----------
    file_path : Path
        The path to the UDL CSV file. Non-``Path`` values are converted
        with ``Path(...)``.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the data from the UDL CSV file, with the
        ``seoList`` and ``senPos`` columns unpacked by
        ``_unpack_nested_columns``.

    Raises
    ------
    ValueError, SyntaxError
        If a ``seoList`` or ``senPos`` cell can be parsed neither as a
        Python literal nor as JSON.
    """
    # Convert to Path if not already
    if not isinstance(file_path, Path):
        file_path = Path(file_path)

    # Read the CSV file into a DataFrame
    data = pd.read_csv(file_path)

    # Convert the string representation of lists/dicts to actual lists/dicts.
    # Python-repr cells (single-quoted) and JSON cells (double-quoted,
    # possibly containing null/true/false) are both accepted.
    data["seoList"] = data["seoList"].apply(_parse_nested_cell)
    data["senPos"] = data["senPos"].apply(_parse_nested_cell)

    return _unpack_nested_columns(data)


def _parse_nested_cell(cell):
    """Parse one CSV cell holding a Python-literal or JSON list/dict."""
    try:
        return ast.literal_eval(cell)
    except (ValueError, SyntaxError) as literal_error:
        # Non-string cells (e.g. NaN for an empty field) cannot be JSON text;
        # keep the original literal_eval error for them.
        if not isinstance(cell, str):
            raise
        try:
            # literal_eval rejects the JSON literals null/true/false.
            return json.loads(cell)
        except json.JSONDecodeError:
            # Neither parser accepted the cell: report the original failure so
            # callers see the same exception as before.
            raise literal_error from None