import os

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from utils.timefeatures import time_features


def preprocess_time_series(
    csv_data,
    input_len,
    pred_len,
    slide_step,
    train_ratio=0.6,
    test_ratio=0.2,
    val_ratio=0.2,
    selected_columns=None,
    date_column='date',
    freq='T',
):
    """
    Preprocess time series data from CSV for model training, testing and validation.
    Applies global Z-score normalization using only training data statistics.

    Args:
        csv_data (pd.DataFrame or str): CSV data as DataFrame or path to CSV file
        input_len (int): Length of input sequence
        pred_len (int): Length of prediction sequence
        slide_step (int): Step size for sliding window
        train_ratio (float): Ratio of data to use for training (default: 0.6)
        test_ratio (float): Ratio of data to use for testing (default: 0.2)
        val_ratio (float): Ratio of data to use for validation (default: 0.2)
        selected_columns (list): List of column names to use (default: None, uses all)
        date_column (str): Name of the date column (default: 'date')
        freq (str): Frequency of the time series data (default: 'T' for minutely)

    Returns:
        dict: Dictionary containing:
            - train_x / train_y: Training input and target sequences
            - train_x_mark / train_y_mark: Training input and target time features
            - test_x / test_y: Testing input and target sequences
            - test_x_mark / test_y_mark: Testing input and target time features
            - val_x / val_y: Validation input and target sequences
            - val_x_mark / val_y_mark: Validation input and target time features
            - scaler: Fitted StandardScaler object for inverse transformation
    """
    # Load data if a path to a CSV file is provided
    if isinstance(csv_data, str):
        try:
            data = pd.read_csv(csv_data)
        except FileNotFoundError:
            raise FileNotFoundError(f"CSV file not found: {csv_data}")
        except Exception as e:
            raise RuntimeError(f"Error loading CSV file: {e}") from e
    else:
        data = csv_data.copy()

    # Extract time features from the date column
    if date_column not in data.columns:
        raise ValueError(f"Date column '{date_column}' not found in data")
    date_index = pd.to_datetime(data[date_column])
    if isinstance(date_index, pd.Series):
        date_index = pd.DatetimeIndex(date_index)
    time_stamp = time_features(date_index, freq=freq)
    time_stamp = time_stamp.transpose(1, 0)  # Shape: (n_samples, n_time_features)

    # Select columns if specified; the date column is always excluded
    if selected_columns is not None:
        data = data[selected_columns]
    else:
        feature_columns = [col for col in data.columns if col != date_column]
        data = data[feature_columns]

    # Validate that the split ratios sum to 1
    if abs(train_ratio + test_ratio + val_ratio - 1.0) > 1e-6:
        raise ValueError(
            f"Ratios must sum to 1.0, got {train_ratio + test_ratio + val_ratio}"
        )

    # Calculate split points
    total_len = len(data)
    train_len = int(total_len * train_ratio)
    test_len = int(total_len * test_ratio)

    # Split data into train, test and validation sets
    train_data = data.iloc[:train_len].values
    test_data = data.iloc[train_len:train_len + test_len].values
    val_data = data.iloc[train_len + test_len:].values

    # Split time features correspondingly
    train_time_stamp = time_stamp[:train_len]
    test_time_stamp = time_stamp[train_len:train_len + test_len]
    val_time_stamp = time_stamp[train_len + test_len:]

    # Global Z-score normalization: fit only on the training split to avoid
    # data leakage, then apply the same scaler to all splits
    scaler = StandardScaler()
    scaler.fit(train_data)
    train_data_scaled = scaler.transform(train_data)
    test_data_scaled = scaler.transform(test_data) if len(test_data) > 0 else test_data
    val_data_scaled = scaler.transform(val_data) if len(val_data) > 0 else val_data

    # Create sliding windows for training data
    train_x, train_y = create_sliding_windows(
        train_data_scaled, input_len, pred_len, slide_step
    )
    train_x_mark, train_y_mark = create_sliding_windows(
        train_time_stamp, input_len, pred_len, slide_step
    )

    # Create sliding windows for testing data
    if len(test_data) > 0:
        test_x, test_y = create_sliding_windows(
            test_data_scaled, input_len, pred_len, slide_step
        )
        test_x_mark, test_y_mark = create_sliding_windows(
            test_time_stamp, input_len, pred_len, slide_step
        )
    else:
        test_x, test_y = np.array([]), np.array([])
        test_x_mark, test_y_mark = np.array([]), np.array([])

    # Create sliding windows for validation data
    if len(val_data) > 0:
        val_x, val_y = create_sliding_windows(
            val_data_scaled, input_len, pred_len, slide_step
        )
        val_x_mark, val_y_mark = create_sliding_windows(
            val_time_stamp, input_len, pred_len, slide_step
        )
    else:
        val_x, val_y = np.array([]), np.array([])
        val_x_mark, val_y_mark = np.array([]), np.array([])

    return {
        'train_x': train_x,
        'train_y': train_y,
        'train_x_mark': train_x_mark,
        'train_y_mark': train_y_mark,
        'test_x': test_x,
        'test_y': test_y,
        'test_x_mark': test_x_mark,
        'test_y_mark': test_y_mark,
        'val_x': val_x,
        'val_y': val_y,
        'val_x_mark': val_x_mark,
        'val_y_mark': val_y_mark,
        'scaler': scaler,
    }
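
# A minimal usage sketch for `preprocess_time_series`, not part of the module's
# API: the column names ('value', 'load') and the synthetic frame below are
# hypothetical, and it assumes `utils.timefeatures.time_features` is importable
# as above. Safe to delete.
def _example_preprocess():
    # Build a small synthetic minutely series with two feature columns
    dates = pd.date_range('2024-01-01', periods=1000, freq='T')
    df = pd.DataFrame({
        'date': dates,
        'value': np.sin(np.arange(1000) / 50.0),
        'load': np.random.default_rng(0).normal(size=1000),
    })
    result = preprocess_time_series(df, input_len=96, pred_len=24, slide_step=1)
    # train_x: (n_windows, input_len, n_features); train_y: (n_windows, pred_len, n_features)
    print(result['train_x'].shape, result['train_y'].shape)
    # The fitted scaler recovers original units from scaled outputs
    restored = result['scaler'].inverse_transform(result['train_y'][0])
    print(restored.shape)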
def create_sliding_windows(data, input_len, pred_len, slide_step):
    """
    Create sliding windows from time series data.

    Args:
        data (np.ndarray): Time series data
        input_len (int): Length of input sequence
        pred_len (int): Length of prediction sequence
        slide_step (int): Step size for sliding window

    Returns:
        tuple: (X, y) where X is input sequences and y is target sequences
    """
    total_len = input_len + pred_len
    X, y = [], []

    # Window start indices; the range bound already guarantees that every
    # window fits, so no extra bounds check is needed inside the loop
    for start_idx in range(0, len(data) - total_len + 1, slide_step):
        window = data[start_idx:start_idx + total_len]
        # Split the window into input and target segments
        X.append(window[:input_len])
        y.append(window[input_len:])

    return np.array(X), np.array(y)
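
# A quick shape check for `create_sliding_windows` on toy data; purely
# illustrative and safe to delete. With 10 samples, input_len=4, pred_len=2
# and slide_step=2, there are 3 windows starting at indices 0, 2 and 4.
def _example_sliding_windows():
    data = np.arange(20).reshape(10, 2)  # 10 time steps, 2 features
    X, y = create_sliding_windows(data, input_len=4, pred_len=2, slide_step=2)
    print(X.shape)  # (3, 4, 2)
    print(y.shape)  # (3, 2, 2)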
def load_and_split_time_series(
    csv_path,
    input_len,
    pred_len,
    slide_step,
    train_ratio=0.6,
    test_ratio=0.2,
    val_ratio=0.2,
    selected_columns=None,
    date_column='date',
    freq='T',
):
    """
    Convenience function to load a CSV file and preprocess the time series data.

    Args:
        csv_path (str): Path to CSV file
        input_len (int): Length of input sequence
        pred_len (int): Length of prediction sequence
        slide_step (int): Step size for sliding window
        train_ratio (float): Ratio of data to use for training (default: 0.6)
        test_ratio (float): Ratio of data to use for testing (default: 0.2)
        val_ratio (float): Ratio of data to use for validation (default: 0.2)
        selected_columns (list): List of column names to use (default: None, uses all)
        date_column (str): Name of the date column (default: 'date')
        freq (str): Frequency of the time series data (default: 'T' for minutely)

    Returns:
        dict: Dictionary containing processed data including time features
    """
    return preprocess_time_series(
        csv_path,
        input_len,
        pred_len,
        slide_step,
        train_ratio,
        test_ratio,
        val_ratio,
        selected_columns,
        date_column,
        freq,
    )


def process_and_save_time_series(
    csv_path,
    output_file,
    input_len,
    pred_len,
    slide_step,
    train_ratio=0.6,
    test_ratio=0.2,
    val_ratio=0.2,
    selected_columns=None,
    date_column='date',
    freq='T',
):
    """
    Process time series data and save it as an NPZ file along with the fitted scaler.

    Args:
        csv_path (str): Path to CSV file
        output_file (str): Path to output NPZ file
        input_len (int): Length of input sequence
        pred_len (int): Length of prediction sequence
        slide_step (int): Step size for sliding window
        train_ratio (float): Ratio of data to use for training (default: 0.6)
        test_ratio (float): Ratio of data to use for testing (default: 0.2)
        val_ratio (float): Ratio of data to use for validation (default: 0.2)
        selected_columns (list): List of column names to use (default: None, uses all)
        date_column (str): Name of the date column (default: 'date')
        freq (str): Frequency of the time series data (default: 'T' for minutely)

    Returns:
        dict: Dictionary containing processed data including time features
    """
    # Create the output directory if it doesn't exist
    output_dir = os.path.dirname(os.path.abspath(output_file))
    os.makedirs(output_dir, exist_ok=True)

    # Load and preprocess the time series data
    result = load_and_split_time_series(
        csv_path=csv_path,
        input_len=input_len,
        pred_len=pred_len,
        slide_step=slide_step,
        train_ratio=train_ratio,
        test_ratio=test_ratio,
        val_ratio=val_ratio,
        selected_columns=selected_columns,
        date_column=date_column,
        freq=freq,
    )

    # Persist the fitted scaler next to the NPZ file for later inverse transforms
    scaler_file = output_file.replace('.npz', '_scaler.gz')
    joblib.dump(result['scaler'], scaler_file)
    print(f"Saved scaler to {scaler_file}")

    # Save the processed data as a single .npz file
    np.savez(
        output_file,
        train_x=result['train_x'],
        train_y=result['train_y'],
        train_x_mark=result['train_x_mark'],
        train_y_mark=result['train_y_mark'],
        test_x=result['test_x'],
        test_y=result['test_y'],
        test_x_mark=result['test_x_mark'],
        test_y_mark=result['test_y_mark'],
        val_x=result['val_x'],
        val_y=result['val_y'],
        val_x_mark=result['val_x_mark'],
        val_y_mark=result['val_y_mark'],
    )
    print(f"Saved processed data to {output_file}")

    return result
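
# End-to-end usage sketch, run only when the module is executed directly. The
# CSV path, output path and shapes below are placeholders, not files shipped
# with this module; swap in your own dataset before running.
if __name__ == "__main__":
    result = process_and_save_time_series(
        csv_path="data/example.csv",       # hypothetical input CSV with a 'date' column
        output_file="output/example.npz",  # NPZ and scaler are saved side by side
        input_len=96,
        pred_len=24,
        slide_step=1,
    )

    # Reload the saved artifacts to verify the round trip
    arrays = np.load("output/example.npz")
    scaler = joblib.load("output/example_scaler.gz")
    print(arrays['train_x'].shape, type(scaler).__name__)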