first timesnet try
dataflow/tsf.py (new file, 337 lines)
@@ -0,0 +1,337 @@
import os

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib

from utils.timefeatures import time_features


def preprocess_time_series(
    csv_data,
    input_len,
    pred_len,
    slide_step,
    train_ratio=0.6,
    test_ratio=0.2,
    val_ratio=0.2,
    selected_columns=None,
    date_column='date',
    freq='T',
):
    """
    Preprocess time series data from a CSV for model training, testing and validation.
    Applies global Z-score normalization using only training data statistics.

    Args:
        csv_data (pd.DataFrame or str): CSV data as a DataFrame or a path to a CSV file
        input_len (int): Length of the input sequence
        pred_len (int): Length of the prediction sequence
        slide_step (int): Step size for the sliding window
        train_ratio (float): Ratio of data to use for training (default: 0.6)
        test_ratio (float): Ratio of data to use for testing (default: 0.2)
        val_ratio (float): Ratio of data to use for validation (default: 0.2)
        selected_columns (list): List of column names to use (default: None, uses all)
        date_column (str): Name of the date column (default: 'date')
        freq (str): Frequency of the time series data (default: 'T' for minutely)

    Returns:
        dict: Dictionary containing:
            - train_x: Training input sequences
            - train_y: Training target sequences
            - train_x_mark: Training input time features
            - train_y_mark: Training target time features
            - test_x: Testing input sequences
            - test_y: Testing target sequences
            - test_x_mark: Testing input time features
            - test_y_mark: Testing target time features
            - val_x: Validation input sequences
            - val_y: Validation target sequences
            - val_x_mark: Validation input time features
            - val_y_mark: Validation target time features
            - scaler: Fitted StandardScaler object for inverse transformation
    """
    # Load data if a path to a CSV file is provided
    if isinstance(csv_data, str):
        try:
            data = pd.read_csv(csv_data)
        except FileNotFoundError:
            raise FileNotFoundError(f"CSV file not found: {csv_data}")
        except Exception as e:
            raise RuntimeError(f"Error loading CSV file: {e}") from e
    else:
        data = csv_data.copy()

    # Extract time features from the date column
    if date_column in data.columns:
        # pd.to_datetime on a column returns a Series; time_features expects a DatetimeIndex
        date_index = pd.DatetimeIndex(pd.to_datetime(data[date_column]))
        time_stamp = time_features(date_index, freq=freq)
        time_stamp = time_stamp.transpose(1, 0)  # Shape: (n_samples, n_time_features)
    else:
        raise ValueError(f"Date column '{date_column}' not found in data")

    # Select feature columns if specified (excluding the date column)
    if selected_columns is not None:
        data = data[selected_columns]
    else:
        # Use all columns except the date column
        feature_columns = [col for col in data.columns if col != date_column]
        data = data[feature_columns]

    # Validate that the ratios sum to 1
    if abs(train_ratio + test_ratio + val_ratio - 1.0) > 1e-6:
        raise ValueError(f"Ratios must sum to 1.0, got {train_ratio + test_ratio + val_ratio}")

    # Calculate split points
    total_len = len(data)
    train_len = int(total_len * train_ratio)
    test_len = int(total_len * test_ratio)

    # Split data into train, test and validation sets (validation takes the remainder)
    train_data = data.iloc[:train_len].values
    test_data = data.iloc[train_len:train_len + test_len].values
    val_data = data.iloc[train_len + test_len:].values

    # Split the time features at the same points
    train_time_stamp = time_stamp[:train_len]
    test_time_stamp = time_stamp[train_len:train_len + test_len]
    val_time_stamp = time_stamp[train_len + test_len:]

    # Global Z-score normalization, z = (x - mean_train) / std_train, for every split
    scaler = StandardScaler()
    scaler.fit(train_data)  # Fit only on training data to avoid data leakage

    # Apply the same scaler to all splits
    train_data_scaled = scaler.transform(train_data)
    test_data_scaled = scaler.transform(test_data) if len(test_data) > 0 else test_data
    val_data_scaled = scaler.transform(val_data) if len(val_data) > 0 else val_data

    # Create sliding windows for the training data
    train_x, train_y = create_sliding_windows(
        train_data_scaled, input_len, pred_len, slide_step
    )
    train_x_mark, train_y_mark = create_sliding_windows(
        train_time_stamp, input_len, pred_len, slide_step
    )

    # Create sliding windows for the testing data
    if len(test_data) > 0:
        test_x, test_y = create_sliding_windows(
            test_data_scaled, input_len, pred_len, slide_step
        )
        test_x_mark, test_y_mark = create_sliding_windows(
            test_time_stamp, input_len, pred_len, slide_step
        )
    else:
        test_x, test_y = np.array([]), np.array([])
        test_x_mark, test_y_mark = np.array([]), np.array([])

    # Create sliding windows for the validation data
    if len(val_data) > 0:
        val_x, val_y = create_sliding_windows(
            val_data_scaled, input_len, pred_len, slide_step
        )
        val_x_mark, val_y_mark = create_sliding_windows(
            val_time_stamp, input_len, pred_len, slide_step
        )
    else:
        val_x, val_y = np.array([]), np.array([])
        val_x_mark, val_y_mark = np.array([]), np.array([])

    return {
        'train_x': train_x,
        'train_y': train_y,
        'train_x_mark': train_x_mark,
        'train_y_mark': train_y_mark,
        'test_x': test_x,
        'test_y': test_y,
        'test_x_mark': test_x_mark,
        'test_y_mark': test_y_mark,
        'val_x': val_x,
        'val_y': val_y,
        'val_x_mark': val_x_mark,
        'val_y_mark': val_y_mark,
        'scaler': scaler,
    }
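
# A minimal usage sketch (illustrative only; the file name, column shapes and
# window sizes below are hypothetical, not fixtures of this repo):
#
#   splits = preprocess_time_series('data/my_series.csv',
#                                   input_len=96, pred_len=24, slide_step=1)
#   # splits['train_x'] -> (n_windows, 96, n_features)
#   # splits['train_y'] -> (n_windows, 24, n_features)
#   # splits['scaler'].inverse_transform(...) undoes the normalization later.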


def create_sliding_windows(data, input_len, pred_len, slide_step):
    """
    Create sliding windows from time series data.

    Args:
        data (np.ndarray): Time series data
        input_len (int): Length of the input sequence
        pred_len (int): Length of the prediction sequence
        slide_step (int): Step size for the sliding window

    Returns:
        tuple: (X, y) where X holds the input sequences and y the target sequences
    """
    total_len = input_len + pred_len
    X, y = [], []

    # Start indices for the sliding windows; the range bound already guarantees
    # that every window fits entirely inside the data
    for start_idx in range(0, len(data) - total_len + 1, slide_step):
        window = data[start_idx:start_idx + total_len]

        # Split the window into input and target segments
        X.append(window[:input_len])
        y.append(window[input_len:])

    # Convert to numpy arrays
    return np.array(X), np.array(y)
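
# Worked example (illustrative numbers): with len(data) == 100, input_len == 24,
# pred_len == 12 and slide_step == 4, each window spans 36 steps and the start
# indices are range(0, 100 - 36 + 1, 4) == 0, 4, ..., 64, so X has shape
# (17, 24, n_features) and y has shape (17, 12, n_features).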


def load_and_split_time_series(
    csv_path,
    input_len,
    pred_len,
    slide_step,
    train_ratio=0.6,
    test_ratio=0.2,
    val_ratio=0.2,
    selected_columns=None,
    date_column='date',
    freq='T',
):
    """
    Convenience wrapper: load a CSV file and preprocess the time series data.

    Args:
        csv_path (str): Path to the CSV file
        input_len (int): Length of the input sequence
        pred_len (int): Length of the prediction sequence
        slide_step (int): Step size for the sliding window
        train_ratio (float): Ratio of data to use for training (default: 0.6)
        test_ratio (float): Ratio of data to use for testing (default: 0.2)
        val_ratio (float): Ratio of data to use for validation (default: 0.2)
        selected_columns (list): List of column names to use (default: None, uses all)
        date_column (str): Name of the date column (default: 'date')
        freq (str): Frequency of the time series data (default: 'T' for minutely)

    Returns:
        dict: Dictionary containing the processed data, including time features
    """
    return preprocess_time_series(
        csv_path,
        input_len,
        pred_len,
        slide_step,
        train_ratio,
        test_ratio,
        val_ratio,
        selected_columns,
        date_column,
        freq,
    )


def process_and_save_time_series(
    csv_path,
    output_file,
    input_len,
    pred_len,
    slide_step,
    train_ratio=0.6,
    test_ratio=0.2,
    val_ratio=0.2,
    selected_columns=None,
    date_column='date',
    freq='T',
):
    """
    Process time series data and save it as an NPZ file along with the fitted scaler.

    Args:
        csv_path (str): Path to the CSV file
        output_file (str): Path to the output NPZ file
        input_len (int): Length of the input sequence
        pred_len (int): Length of the prediction sequence
        slide_step (int): Step size for the sliding window
        train_ratio (float): Ratio of data to use for training (default: 0.6)
        test_ratio (float): Ratio of data to use for testing (default: 0.2)
        val_ratio (float): Ratio of data to use for validation (default: 0.2)
        selected_columns (list): List of column names to use (default: None, uses all)
        date_column (str): Name of the date column (default: 'date')
        freq (str): Frequency of the time series data (default: 'T' for minutely)

    Returns:
        dict: Dictionary containing the processed data, including time features
    """
    # Create the output directory if it doesn't exist
    output_dir = os.path.dirname(os.path.abspath(output_file))
    os.makedirs(output_dir, exist_ok=True)

    # Load and preprocess the time series data
    result = load_and_split_time_series(
        csv_path=csv_path,
        input_len=input_len,
        pred_len=pred_len,
        slide_step=slide_step,
        train_ratio=train_ratio,
        test_ratio=test_ratio,
        val_ratio=val_ratio,
        selected_columns=selected_columns,
        date_column=date_column,
        freq=freq,
    )

    # Save the fitted scaler so predictions can be inverse-transformed later
    scaler_file = output_file.replace('.npz', '_scaler.gz')
    joblib.dump(result['scaler'], scaler_file)
    print(f"Saved scaler to {scaler_file}")

    # Save the processed arrays (everything except the scaler) as a single .npz file
    arrays = {key: value for key, value in result.items() if key != 'scaler'}
    np.savez(output_file, **arrays)
    print(f"Saved processed data to {output_file}")

    return result
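

if __name__ == '__main__':
    # Minimal smoke test on synthetic data; illustrative only. The column name
    # 'value' and the window sizes here are arbitrary choices, not values the
    # rest of the repo depends on.
    dates = pd.date_range('2024-01-01', periods=500, freq='T')
    demo = pd.DataFrame({'date': dates, 'value': np.sin(np.arange(500) / 10.0)})
    splits = preprocess_time_series(demo, input_len=48, pred_len=12, slide_step=1)
    print('train_x:', splits['train_x'].shape)  # (241, 48, 1)
    print('test_x:', splits['test_x'].shape)    # (41, 48, 1)
    print('val_x:', splits['val_x'].shape)      # (41, 48, 1)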