LSTM 모델

david100gom 2025. 2. 27. 13:58

온도 데이터를 사용하여 LSTM 모델을 훈련하고 예측하는 완전한 파이썬 스크립트입니다. 주요 기능은 다음과 같습니다:

- 데이터 준비: 샘플 온도 데이터를 생성합니다 (실제 구현 시 CSV 파일 등에서 데이터를 로드할 수 있습니다).
- 데이터 전처리: 정규화와 시퀀스 데이터 생성을 수행합니다.
- LSTM 모델 구축: 2층 LSTM 모델을 생성하고 Dropout을 적용하여 과적합을 방지합니다.
- 모델 학습: 조기 종료(Early Stopping) 콜백을 사용하여 모델이 과적합되기 전에 학습을 중단합니다.
- 예측 및 평가: 테스트 데이터로 모델을 평가하고 성능을 시각화합니다.
- 이상 감지: 예측값과 실제값의 차이를 기반으로 이상점을 감지합니다.
- 미래 예측: 마지막 시퀀스를 사용하여 미래 24시간의 온도를 예측합니다.
- 모델 저장: 훈련된 모델을 파일로 저장합니다.
- 실용적 함수: 실제 데이터 로드 및 실시간 이상 감지를 위한 함수도 포함되어 있습니다.

실제 프로젝트에서는 샘플 데이터 생성 부분을 실제 온도 데이터 로드 코드로 대체하면 됩니다. 또한 threshold 값이나 LSTM 네트워크의 구조를 데이터 특성에 맞게 조정할 수 있습니다.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import math

# 1. Data preparation (synthetic sample data; replace with a real loader such as a CSV reader)
np.random.seed(42)  # fixed seed so the sample series is reproducible
n_samples = 1000  # number of hourly readings in the sample series
# Lower-case 'h' is the hourly alias; the upper-case 'H' spelling is deprecated in pandas 2.2+.
dates = pd.date_range(start='2023-01-01', periods=n_samples, freq='h')
base_temp = 5  # constant baseline temperature
seasonal_temp = 3 * np.sin(np.linspace(0, 4*np.pi, n_samples))  # seasonal component (2 full cycles over the series)
daily_temp = 2 * np.sin(np.linspace(0, 80*np.pi, n_samples))  # daily variation (40 full cycles over the series)
noise = 0.5 * np.random.randn(n_samples)  # random Gaussian noise
temperature = base_temp + seasonal_temp + daily_temp + noise

# Wrap the series in a DataFrame indexed by timestamp
df = pd.DataFrame({'datetime': dates, 'temperature': temperature})
df.set_index('datetime', inplace=True)

# Plot the raw series and write it to disk
plt.figure(figsize=(12, 6))
plt.plot(df.index, df['temperature'])
plt.title('온도 데이터 시계열')
plt.xlabel('시간')
plt.ylabel('온도 (°C)')
plt.grid(True)
plt.tight_layout()
plt.savefig('temperature_timeseries.png')
plt.close()  # pyplot keeps a figure alive until it is closed; release it once saved

# 2. Preprocessing
# Scale the temperature column into [0, 1].
# NOTE(review): the scaler is fitted on the whole series, so its min/max also
# reflect rows that end up in the test split further down - confirm this is acceptable.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df.values)

# Sliding-window helper that turns a series into supervised (window, next value) pairs
def create_sequences(data, seq_length):
    """
    Slice ``data`` into overlapping windows and the value that follows each one.

    Parameters:
    data: indexable sequence (e.g. a NumPy array) of observations
    seq_length (int): number of consecutive observations per window

    Returns:
    tuple: (windows, targets) as NumPy arrays, where targets[i] is the
    observation immediately after windows[i]; both are empty when ``data``
    has no more than ``seq_length`` rows
    """
    window_count = len(data) - seq_length
    windows = [data[start:start + seq_length] for start in range(window_count)]
    targets = [data[start + seq_length] for start in range(window_count)]
    return np.array(windows), np.array(targets)

# Window length: 24 steps of history per sample (24 hours = 1 day)
sequence_length = 24

# Build the (window, next value) pairs from the scaled series
X, y = create_sequences(scaled_data, sequence_length)

# Chronological split: the first 80% of the windows train the model, the rest test it
train_size = int(len(X) * 0.8)
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

# Force the (samples, timesteps, 1) layout used as the LSTM input
X_train = X_train.reshape(*X_train.shape[:2], 1)
X_test = X_test.reshape(*X_test.shape[:2], 1)

print(f"훈련 데이터 형태: {X_train.shape}")
print(f"테스트 데이터 형태: {X_test.shape}")

# 3. Build the LSTM model
# Two stacked LSTM layers, each followed by dropout, feeding a single-unit output layer.
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(sequence_length, 1)))
model.add(Dropout(0.2))
model.add(LSTM(units=50))
model.add(Dropout(0.2))
model.add(Dense(1))

# Compile with Adam and a mean-squared-error loss
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

# Early stopping: watch the validation loss, allow 10 epochs without improvement,
# then restore the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model; 20% of the training data is held out for validation
training_options = dict(
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)
history = model.fit(X_train, y_train, **training_options)

# 4. Evaluation and prediction
# Plot training vs. validation loss per epoch
plt.figure(figsize=(12, 6))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('모델 학습 과정')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('training_history.png')
plt.close()  # pyplot keeps a figure alive until it is closed; release it once saved

# Predict on the held-out windows
y_pred = model.predict(X_test)

# Undo the scaling so targets and predictions are back in the original units
y_test_inv = scaler.inverse_transform(y_test)
y_pred_inv = scaler.inverse_transform(y_pred)

# Timestamps of the test targets: target i sits sequence_length rows after the
# start of window i, and the test windows begin at train_size.
test_index = df.index[train_size+sequence_length:]

# Plot actual vs. predicted temperature over the test period
plt.figure(figsize=(12, 6))
plt.plot(test_index, y_test_inv, label='실제 온도')
plt.plot(test_index, y_pred_inv, label='예측 온도')
plt.title('LSTM 온도 예측 결과')
plt.xlabel('시간')
plt.ylabel('온도 (°C)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('prediction_results.png')
plt.close()

# Root mean squared error in the original units
rmse = math.sqrt(mean_squared_error(y_test_inv, y_pred_inv))
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}°C')

# 5. Anomaly detection: flag points whose absolute prediction error exceeds the threshold
threshold = rmse * 2  # threshold is twice the test RMSE
errors = np.abs(y_test_inv - y_pred_inv)
anomalies = errors > threshold

# Compute the 1-D mask and the test timestamps once and reuse them for every plot call
anomaly_mask = anomalies.flatten()
anomaly_index = df.index[train_size+sequence_length:]

# Plot actual vs. predicted values and mark the flagged points in red
plt.figure(figsize=(12, 6))
plt.plot(anomaly_index, y_test_inv, label='실제 온도')
plt.plot(anomaly_index, y_pred_inv, label='예측 온도')
plt.scatter(anomaly_index[anomaly_mask],
            y_test_inv[anomaly_mask],
            color='red', label='감지된 이상', s=50)
plt.title('온도 데이터 이상 감지 결과')
plt.xlabel('시간')
plt.ylabel('온도 (°C)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('anomaly_detection.png')
plt.close()  # pyplot keeps a figure alive until it is closed; release it once saved

print(f'감지된 이상 포인트 수: {np.sum(anomalies)}/{len(anomalies)}')

# 6. Future forecast (default: the next 24 steps)
def predict_future(model, last_sequence, n_future=24, inverse_scaler=None):
    """
    Forecast ``n_future`` steps by feeding each prediction back into the input window.

    Parameters:
    model: fitted model whose ``predict`` accepts a (1, window, 1) array and
        returns the next scaled value with shape (1, 1)
    last_sequence (array): most recent scaled observations; its length
        determines the window size
    n_future (int): number of steps to forecast
    inverse_scaler: object providing ``inverse_transform``; when omitted the
        module-level ``scaler`` is used

    Returns:
    ndarray: forecasts after inverse scaling, shape (n_future, 1)
    """
    if inverse_scaler is None:
        inverse_scaler = scaler  # fall back to the scaler defined at module level

    # Nothing to forecast: return an empty column instead of handing the
    # scaler an array with zero rows.
    if n_future <= 0:
        return np.empty((0, 1))

    # Work on a flat copy so the caller's array is never modified
    window = np.array(last_sequence).ravel()
    future_predictions = []

    for _ in range(n_future):
        # Window size comes from the data itself (-1), not from a global constant
        next_pred = model.predict(window.reshape(1, -1, 1))[0]
        future_predictions.append(next_pred[0])

        # Slide the window: drop the oldest value, append the new prediction
        window = np.append(window[1:], next_pred)

    # Undo the scaling
    return inverse_scaler.inverse_transform(np.array(future_predictions).reshape(-1, 1))

# The most recent window of scaled observations seeds the forecast
last_sequence = scaled_data[-sequence_length:]

# Forecast with the default horizon of predict_future
future_pred = predict_future(model, last_sequence)

# Timestamps for the forecast: hourly steps after the last observation.
# The count follows len(future_pred) so it stays in sync with the forecast
# horizon; the first generated stamp is the last observation itself and is dropped.
# Lower-case 'h' is used because the upper-case 'H' alias is deprecated in pandas 2.2+.
future_dates = pd.date_range(start=df.index[-1], periods=len(future_pred) + 1, freq='h')[1:]

# Plot the last 100 observations together with the forecast
plt.figure(figsize=(12, 6))
plt.plot(df.index[-100:], scaler.inverse_transform(scaled_data[-100:]), label='과거 온도')
plt.plot(future_dates, future_pred, label='미래 예측', color='green', linestyle='--')
plt.title('미래 24시간 온도 예측')
plt.xlabel('시간')
plt.ylabel('온도 (°C)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('future_prediction.png')
plt.close()  # pyplot keeps a figure alive until it is closed; release it once saved

# 7. Save the trained model
model.save('temperature_lstm_model.h5')
print("모델이 'temperature_lstm_model.h5'로 저장되었습니다.")

# Loader for real temperature data (reference implementation)
def load_temperature_data(file_path):
    """
    Read a temperature CSV and index it by timestamp.

    Parameters:
    file_path (str): path of the CSV file; it must contain a 'datetime' column

    Returns:
    DataFrame: the CSV contents with the parsed 'datetime' column as index
    """
    frame = pd.read_csv(file_path, parse_dates=['datetime'])
    return frame.set_index('datetime')

# Real-time anomaly check (reference implementation)
def detect_anomaly_realtime(model, new_data, scaler, sequence_length, threshold):
    """
    Decide whether the newest data point deviates from the model's prediction.

    Parameters:
    model: trained model whose ``predict`` accepts a (1, sequence_length, 1) array
    new_data (array-like): the latest sequence_length + 1 raw data points;
        the last one is the value being checked
    scaler: scaler used during training (needs ``transform`` and ``inverse_transform``)
    sequence_length (int): number of points the model takes as input
    threshold (float): absolute error above which the point counts as an anomaly

    Returns:
    bool: True if the point is an anomaly, False otherwise
    float: absolute prediction error after inverse scaling

    Raises:
    ValueError: if ``new_data`` does not hold exactly sequence_length + 1 points
    """
    # Accept lists and tuples as well as arrays
    values = np.asarray(new_data).reshape(-1, 1)
    if len(values) != sequence_length + 1:
        raise ValueError(
            f"new_data must contain sequence_length + 1 = {sequence_length + 1} "
            f"points, got {len(values)}"
        )

    # Scale with the scaler fitted during training
    scaled = scaler.transform(values)

    # Everything but the last point is the model input; the last point is the truth
    input_seq = scaled[:-1].reshape(1, sequence_length, 1)
    actual = scaled[-1]

    # Predict the next value
    pred = model.predict(input_seq)[0]

    # Undo the scaling on both values before comparing them
    actual_inv = scaler.inverse_transform(actual.reshape(-1, 1))[0][0]
    pred_inv = scaler.inverse_transform(pred.reshape(-1, 1))[0][0]

    # Return plain Python scalars, as the documented return types state
    error = float(abs(actual_inv - pred_inv))
    is_anomaly = bool(error > threshold)

    return is_anomaly, error

728x90

저작자표시 비영리 변경금지