#!/usr/bin/env python3
"""
Test script for processing ETT datasets with different prediction lengths.

Processes ETTm1.csv and ETTm2.csv with prediction lengths of 96, 192, 336, 720.
"""

import os
import sys

import pandas as pd

from dataflow import process_and_save_time_series


def _read_feature_columns(csv_path):
    """Return every column name of *csv_path* except the first one.

    Only the header row is parsed (``nrows=0``), so the cost does not depend
    on the number of data rows in the file.

    Args:
        csv_path: Path of the CSV file to inspect.

    Returns:
        A list with the names of all columns but the first.
    """
    header = pd.read_csv(csv_path, nrows=0)
    # The first column is skipped on the assumption that it is the date
    # column (main() passes date_column='date' alongside these names).
    return header.columns[1:].tolist()


def _print_split_shapes(result):
    """Print the array shapes stored in *result* for every split.

    Args:
        result: Mapping returned by ``process_and_save_time_series``; it is
            indexed with the ``<split>_x``, ``<split>_y``, ``<split>_x_mark``
            and ``<split>_y_mark`` keys for the train, test and val splits.
    """
    splits = (("Train", "train"), ("Test", "test"), ("Val", "val"))

    # Data shapes first, then time-mark shapes, in train/test/val order.
    for label, key in splits:
        x_shape = result[f"{key}_x"].shape
        y_shape = result[f"{key}_y"].shape
        print(f" {label}: {x_shape} -> {y_shape}")

    for label, key in splits:
        x_mark_shape = result[f"{key}_x_mark"].shape
        y_mark_shape = result[f"{key}_y_mark"].shape
        print(f" {label} time marks: {x_mark_shape} -> {y_mark_shape}")


def main():
    """Process every configured dataset for every prediction length.

    For each dataset whose CSV exists under ``data/ETT-small``, one ``.npz``
    file per prediction length is requested from
    ``process_and_save_time_series`` and written under ``processed_data``.
    A failure for one dataset/prediction-length combination is printed and
    the remaining combinations are still attempted.
    """
    # Configuration
    datasets = ['ETTm1', 'ETTm2']
    input_len = 96
    pred_lengths = [96, 192, 336, 720]
    slide_step = 1

    # Split ratios (train:test:val = 6:2:2)
    train_ratio = 0.6
    test_ratio = 0.2
    val_ratio = 0.2

    # Base paths
    data_dir = 'data/ETT-small'
    output_dir = 'processed_data'

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    print("Starting ETT dataset processing...")
    print(f"Input length: {input_len}")
    print(f"Split ratios - Train: {train_ratio}, Test: {test_ratio}, Val: {val_ratio}")
    print("-" * 60)

    # Process each dataset
    for dataset in datasets:
        csv_path = os.path.join(data_dir, f"{dataset}.csv")

        # Check if CSV file exists
        if not os.path.exists(csv_path):
            print(f"Warning: {csv_path} not found, skipping...")
            continue

        print(f"\nProcessing {dataset}...")

        # The column names depend only on the dataset, so they are read once
        # here instead of re-parsing the whole CSV for every prediction
        # length.
        try:
            feature_columns = _read_feature_columns(csv_path)
        except Exception as e:
            # Best-effort script: report the problem and move on to the next
            # dataset rather than aborting the whole run.
            print(f" Error reading columns of {csv_path}: {e}")
            continue

        # Process each prediction length
        for pred_len in pred_lengths:
            output_file = os.path.join(
                output_dir, f"{dataset}_input{input_len}_pred{pred_len}.npz"
            )

            print(f" - Prediction length {pred_len} -> {output_file}")
            print(f" Features: {feature_columns} (excluding date column)")

            try:
                # NOTE(review): the ETTm* files are commonly the 15-minute
                # variant of ETT; confirm that freq='h' is what
                # process_and_save_time_series expects for them.
                result = process_and_save_time_series(
                    csv_path=csv_path,
                    output_file=output_file,
                    input_len=input_len,
                    pred_len=pred_len,
                    slide_step=slide_step,
                    train_ratio=train_ratio,
                    test_ratio=test_ratio,
                    val_ratio=val_ratio,
                    selected_columns=feature_columns,
                    date_column='date',
                    freq='h'
                )

                # Print dataset shapes for verification
                _print_split_shapes(result)
            except Exception as e:
                # One failing combination must not stop the remaining ones.
                print(f" Error processing {dataset} with pred_len {pred_len}: {e}")

    print("\n" + "=" * 60)
    print("Processing completed!")
    print(f"Output files saved in: {os.path.abspath(output_dir)}")


if __name__ == "__main__":
    main()