first timesnet try
dataflow/tsf.py (new file, 337 lines)
@@ -0,0 +1,337 @@
import os

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib

from utils.timefeatures import time_features


def preprocess_time_series(
    csv_data,
    input_len,
    pred_len,
    slide_step,
    train_ratio=0.6,
    test_ratio=0.2,
    val_ratio=0.2,
    selected_columns=None,
    date_column='date',
    freq='T',
):
    """
    Preprocess time series data from a CSV for model training, testing and validation.
    Applies global Z-score normalization using only training data statistics.

    Args:
        csv_data (pd.DataFrame or str): CSV data as a DataFrame or a path to a CSV file
        input_len (int): Length of the input sequence
        pred_len (int): Length of the prediction sequence
        slide_step (int): Step size for the sliding window
        train_ratio (float): Ratio of data to use for training (default: 0.6)
        test_ratio (float): Ratio of data to use for testing (default: 0.2)
        val_ratio (float): Ratio of data to use for validation (default: 0.2)
        selected_columns (list): List of column names to use (default: None, uses all)
        date_column (str): Name of the date column (default: 'date')
        freq (str): Frequency of the time series data (default: 'T' for minutely)

    Returns:
        dict: Dictionary containing:
            - train_x: Training input sequences
            - train_y: Training target sequences
            - train_x_mark: Training input time features
            - train_y_mark: Training target time features
            - test_x: Testing input sequences
            - test_y: Testing target sequences
            - test_x_mark: Testing input time features
            - test_y_mark: Testing target time features
            - val_x: Validation input sequences
            - val_y: Validation target sequences
            - val_x_mark: Validation input time features
            - val_y_mark: Validation target time features
            - scaler: Fitted StandardScaler object for inverse transformation
    """
    # Load data if a path to a CSV file is provided
    if isinstance(csv_data, str):
        try:
            data = pd.read_csv(csv_data)
        except FileNotFoundError:
            raise FileNotFoundError(f"CSV file not found: {csv_data}")
        except Exception as e:
            raise RuntimeError(f"Error loading CSV file: {e}") from e
    else:
        data = csv_data.copy()

    # Extract time features from the date column
    if date_column in data.columns:
        # pd.to_datetime on a column returns a Series; time_features expects a DatetimeIndex
        date_index = pd.DatetimeIndex(pd.to_datetime(data[date_column]))
        time_stamp = time_features(date_index, freq=freq)
        time_stamp = time_stamp.transpose(1, 0)  # Shape: (n_samples, n_time_features)
    else:
        raise ValueError(f"Date column '{date_column}' not found in data")

    # Select feature columns if specified (excluding the date column)
    if selected_columns is not None:
        data = data[selected_columns]
    else:
        # Use all columns except the date column
        feature_columns = [col for col in data.columns if col != date_column]
        data = data[feature_columns]

    # Validate that the ratios sum to 1
    if abs(train_ratio + test_ratio + val_ratio - 1.0) > 1e-6:
        raise ValueError(f"Ratios must sum to 1.0, got {train_ratio + test_ratio + val_ratio}")

    # Calculate split points
    total_len = len(data)
    train_len = int(total_len * train_ratio)
    test_len = int(total_len * test_ratio)

    # Split data into train, test and validation sets (validation takes the remainder)
    train_data = data.iloc[:train_len].values
    test_data = data.iloc[train_len:train_len + test_len].values
    val_data = data.iloc[train_len + test_len:].values

    # Split the time features at the same points
    train_time_stamp = time_stamp[:train_len]
    test_time_stamp = time_stamp[train_len:train_len + test_len]
    val_time_stamp = time_stamp[train_len + test_len:]

    # Global Z-score normalization, z = (x - mean_train) / std_train, for every split
    scaler = StandardScaler()
    scaler.fit(train_data)  # Fit only on training data to avoid data leakage

    # Apply the same scaler to all splits
    train_data_scaled = scaler.transform(train_data)
    test_data_scaled = scaler.transform(test_data) if len(test_data) > 0 else test_data
    val_data_scaled = scaler.transform(val_data) if len(val_data) > 0 else val_data

    # Create sliding windows for the training data
    train_x, train_y = create_sliding_windows(
        train_data_scaled, input_len, pred_len, slide_step
    )
    train_x_mark, train_y_mark = create_sliding_windows(
        train_time_stamp, input_len, pred_len, slide_step
    )

    # Create sliding windows for the testing data
    if len(test_data) > 0:
        test_x, test_y = create_sliding_windows(
            test_data_scaled, input_len, pred_len, slide_step
        )
        test_x_mark, test_y_mark = create_sliding_windows(
            test_time_stamp, input_len, pred_len, slide_step
        )
    else:
        test_x, test_y = np.array([]), np.array([])
        test_x_mark, test_y_mark = np.array([]), np.array([])

    # Create sliding windows for the validation data
    if len(val_data) > 0:
        val_x, val_y = create_sliding_windows(
            val_data_scaled, input_len, pred_len, slide_step
        )
        val_x_mark, val_y_mark = create_sliding_windows(
            val_time_stamp, input_len, pred_len, slide_step
        )
    else:
        val_x, val_y = np.array([]), np.array([])
        val_x_mark, val_y_mark = np.array([]), np.array([])

    return {
        'train_x': train_x,
        'train_y': train_y,
        'train_x_mark': train_x_mark,
        'train_y_mark': train_y_mark,
        'test_x': test_x,
        'test_y': test_y,
        'test_x_mark': test_x_mark,
        'test_y_mark': test_y_mark,
        'val_x': val_x,
        'val_y': val_y,
        'val_x_mark': val_x_mark,
        'val_y_mark': val_y_mark,
        'scaler': scaler,
    }
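
# A minimal usage sketch (illustrative only; the file name, column shapes and
# window sizes below are hypothetical, not fixtures of this repo):
#
#   splits = preprocess_time_series('data/my_series.csv',
#                                   input_len=96, pred_len=24, slide_step=1)
#   # splits['train_x'] -> (n_windows, 96, n_features)
#   # splits['train_y'] -> (n_windows, 24, n_features)
#   # splits['scaler'].inverse_transform(...) undoes the normalization later.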


def create_sliding_windows(data, input_len, pred_len, slide_step):
    """
    Create sliding windows from time series data.

    Args:
        data (np.ndarray): Time series data
        input_len (int): Length of the input sequence
        pred_len (int): Length of the prediction sequence
        slide_step (int): Step size for the sliding window

    Returns:
        tuple: (X, y) where X holds the input sequences and y the target sequences
    """
    total_len = input_len + pred_len
    X, y = [], []

    # Start indices for the sliding windows; the range bound already guarantees
    # that every window fits entirely inside the data
    for start_idx in range(0, len(data) - total_len + 1, slide_step):
        window = data[start_idx:start_idx + total_len]

        # Split the window into input and target segments
        X.append(window[:input_len])
        y.append(window[input_len:])

    # Convert to numpy arrays
    return np.array(X), np.array(y)
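
# Worked example (illustrative numbers): with len(data) == 100, input_len == 24,
# pred_len == 12 and slide_step == 4, each window spans 36 steps and the start
# indices are range(0, 100 - 36 + 1, 4) == 0, 4, ..., 64, so X has shape
# (17, 24, n_features) and y has shape (17, 12, n_features).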


def load_and_split_time_series(
    csv_path,
    input_len,
    pred_len,
    slide_step,
    train_ratio=0.6,
    test_ratio=0.2,
    val_ratio=0.2,
    selected_columns=None,
    date_column='date',
    freq='T',
):
    """
    Convenience wrapper: load a CSV file and preprocess the time series data.

    Args:
        csv_path (str): Path to the CSV file
        input_len (int): Length of the input sequence
        pred_len (int): Length of the prediction sequence
        slide_step (int): Step size for the sliding window
        train_ratio (float): Ratio of data to use for training (default: 0.6)
        test_ratio (float): Ratio of data to use for testing (default: 0.2)
        val_ratio (float): Ratio of data to use for validation (default: 0.2)
        selected_columns (list): List of column names to use (default: None, uses all)
        date_column (str): Name of the date column (default: 'date')
        freq (str): Frequency of the time series data (default: 'T' for minutely)

    Returns:
        dict: Dictionary containing the processed data, including time features
    """
    return preprocess_time_series(
        csv_path,
        input_len,
        pred_len,
        slide_step,
        train_ratio,
        test_ratio,
        val_ratio,
        selected_columns,
        date_column,
        freq,
    )


def process_and_save_time_series(
    csv_path,
    output_file,
    input_len,
    pred_len,
    slide_step,
    train_ratio=0.6,
    test_ratio=0.2,
    val_ratio=0.2,
    selected_columns=None,
    date_column='date',
    freq='T',
):
    """
    Process time series data and save it as an NPZ file along with the fitted scaler.

    Args:
        csv_path (str): Path to the CSV file
        output_file (str): Path to the output NPZ file
        input_len (int): Length of the input sequence
        pred_len (int): Length of the prediction sequence
        slide_step (int): Step size for the sliding window
        train_ratio (float): Ratio of data to use for training (default: 0.6)
        test_ratio (float): Ratio of data to use for testing (default: 0.2)
        val_ratio (float): Ratio of data to use for validation (default: 0.2)
        selected_columns (list): List of column names to use (default: None, uses all)
        date_column (str): Name of the date column (default: 'date')
        freq (str): Frequency of the time series data (default: 'T' for minutely)

    Returns:
        dict: Dictionary containing the processed data, including time features
    """
    # Create the output directory if it doesn't exist
    output_dir = os.path.dirname(os.path.abspath(output_file))
    os.makedirs(output_dir, exist_ok=True)

    # Load and preprocess the time series data
    result = load_and_split_time_series(
        csv_path=csv_path,
        input_len=input_len,
        pred_len=pred_len,
        slide_step=slide_step,
        train_ratio=train_ratio,
        test_ratio=test_ratio,
        val_ratio=val_ratio,
        selected_columns=selected_columns,
        date_column=date_column,
        freq=freq,
    )

    # Save the fitted scaler so predictions can be inverse-transformed later
    scaler_file = output_file.replace('.npz', '_scaler.gz')
    joblib.dump(result['scaler'], scaler_file)
    print(f"Saved scaler to {scaler_file}")

    # Save the processed arrays (everything except the scaler) as a single .npz file
    arrays = {key: value for key, value in result.items() if key != 'scaler'}
    np.savez(output_file, **arrays)
    print(f"Saved processed data to {output_file}")

    return result
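

if __name__ == '__main__':
    # Minimal smoke test on synthetic data; illustrative only. The column name
    # 'value' and the window sizes here are arbitrary choices, not values the
    # rest of the repo depends on.
    dates = pd.date_range('2024-01-01', periods=500, freq='T')
    demo = pd.DataFrame({'date': dates, 'value': np.sin(np.arange(500) / 10.0)})
    splits = preprocess_time_series(demo, input_len=48, pred_len=12, slide_step=1)
    print('train_x:', splits['train_x'].shape)  # (241, 48, 1)
    print('test_x:', splits['test_x'].shape)    # (41, 48, 1)
    print('val_x:', splits['val_x'].shape)      # (41, 48, 1)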