LSTM (out-of-sample test)¶

In [1]:
import pandas as pd
import numpy as np
import os, sys
from pathlib import Path
import torch
from datetime import datetime
import logging
import mlflow
import warnings

# Set up paths and logging.
# Defaults match the author's machine, but both roots can be overridden via
# environment variables so the notebook is portable; behavior is unchanged
# when the variables are unset.
root_folder_path = Path(os.environ.get('RESEARCH_ROOT', '/home/ytli/research'))
experiment_folder_path = Path(os.environ.get('LSTM_EXPERIMENT_ROOT', str(root_folder_path / 'lstm')))
sys.path.append(str(root_folder_path))
sys.path.append(str(experiment_folder_path))

from modules.study_multivar import RealDataTimeSeriesAnalysis, calculate_mse_by_subject_feature
from modules.plot import plot_subject_feature_html_interactive
In [2]:
# Experiment identifiers and data locations.
study_name = "study5-realdata"
folder_path = "fisher"
method_name = "lstm"
real_data_path = experiment_folder_path / study_name / folder_path
datafile_path = "data/fisher_all.csv"

# Initialize the analysis helper for this study/dataset.
analysis = RealDataTimeSeriesAnalysis(study_name, folder_path, datafile_path)

# Features used in this run (a subset of the full candidate set below).
# feature_cols = ['energetic','enthusiastic','content','irritable','restless','worried','guilty','afraid','anhedonia','angry','hopeless','down','positive','fatigue','tension','concentrate','ruminate','avoid_act','reassure','procrast','avoid_people']
feature_cols = [
    'down',
    'positive',
    'content',
    'enthusiastic',
    'energetic',
    'hopeless',
    'angry',
    'irritable',
    'reassure',
]

df = analysis.load_data()
subject_ids = df['subject_id'].unique()
In [3]:
def _read_subject_list(path):
    """Read one subject id per line from *path*, stripping surrounding whitespace."""
    with open(path, 'r') as f:
        # Iterate the file object directly instead of materializing f.readlines().
        return [line.strip() for line in f]

# Train/test subject splits were fixed ahead of time and stored on disk so the
# same split is reused across experiments.
random_train_subject_list = _read_subject_list(real_data_path / 'data' / 'random_train_subject_list.txt')
random_test_subject_list = _read_subject_list(real_data_path / 'data' / 'random_test_subject_list.txt')

print("random_train_subject_list", random_train_subject_list)
print("random_test_subject_list", random_test_subject_list)
random_train_subject_list ['p007', 'p040', 'p072', 'p033', 'p137', 'p203', 'p100', 'p010', 'p204', 'p215', 'p001', 'p008', 'p019', 'p169', 'p021', 'p014', 'p074', 'p139', 'p111', 'p048']
random_test_subject_list ['p004', 'p075', 'p117', 'p025', 'p113', 'p145', 'p163', 'p006', 'p217', 'p127', 'p012', 'p160', 'p115', 'p037', 'p013', 'p023', 'p202', 'p009', 'p068', 'p003']
In [4]:
sliding_windows_dict = analysis.create_sliding_windows(df, feature_cols, window_size=5, stride=1)

# Partition per-subject windows by the pre-computed train/test subject split.
# (Raises KeyError if a listed subject has no windows — intentional fail-fast.)
train_sliding_windows_dict = {sid: sliding_windows_dict[sid] for sid in random_train_subject_list}
test_sliding_windows_dict = {sid: sliding_windows_dict[sid] for sid in random_test_subject_list}

train_loader, val_loader, _ = analysis.prepare_datasets(train_sliding_windows_dict)
# NOTE(review): the first loader returned by prepare_datasets is reused as the
# test loader; Lightning warns its sampler has shuffling enabled (see run log).
# Confirm prepare_datasets can produce an unshuffled loader for evaluation.
test_loader, _, _ = analysis.prepare_datasets(test_sliding_windows_dict)

# Train model on the train-split subjects, validating and testing as we go.
print("\033[1m\033[95mTraining model...\033[0m")
best_model, best_model_checkpoint_metrics, test_results = analysis.train_model(train_loader, val_loader, test_loader)
print(f"\033[1m\033[96mBEST MODEL VAL LOSS:\033[0m {best_model_checkpoint_metrics['val_loss']:.5f}")
print(f"\033[1m\033[96mTEST RESULTS:\033[0m loss = {test_results[0]['test_loss']:.5f}")

# Evaluate and visualize on the held-out test subjects.
print("\033[1m\033[95mEvaluating model on test data...\033[0m")
test_eval_data = analysis.evaluate_model(best_model, test_loader)
print("="*80)
2025/05/22 10:47:01 WARNING mlflow.utils.autologging_utils: MLflow pytorch autologging is known to be compatible with 1.9.0 <= torch <= 2.6.0, but the installed version is 2.6.0+cu124. If you encounter errors during autologging, try upgrading / downgrading torch to a compatible version, or try upgrading MLflow.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Training model...
2025/05/22 10:47:10 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/ytli/miniconda3/envs/research/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:476: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
🏃 View run fisher at: http://localhost:8093/#/experiments/92/runs/c5ec109970594df3a7e975e5d78a2619
🧪 View experiment at: http://localhost:8093/#/experiments/92
BEST MODEL VAL LOSS: 0.72193
TEST RESULTS: loss = 1.73910
Evaluating model on test data...
================================================================================
In [5]:
# Interactive per-subject / per-feature plots of the evaluation results.
plot_subject_feature_html_interactive(test_eval_data, feature_cols)
In [6]:
# Per-subject, per-feature MSE on the held-out (generalization) subjects.
mse_df = calculate_mse_by_subject_feature(test_eval_data, feature_cols)
# Plain string literal: the original f-string had no placeholders.
mse_df.to_csv('mse-generalize.csv', index=False)
mse_df.head(20)
Out[6]:
subject_id feature_name mse
0 p004 down 1.064111
1 p004 positive 1.522837
2 p004 content 1.389245
3 p004 enthusiastic 1.459119
4 p004 energetic 1.225313
5 p004 hopeless 1.329111
6 p004 angry 0.936798
7 p004 irritable 1.166241
8 p004 reassure 1.024458
9 p075 down 0.941535
10 p075 positive 1.033084
11 p075 content 1.044805
12 p075 enthusiastic 0.889902
13 p075 energetic 1.104642
14 p075 hopeless 0.988972
15 p075 angry 0.815556
16 p075 irritable 1.263785
17 p075 reassure 0.749723
18 p117 down 0.959132
19 p117 positive 1.213930
In [ ]: