강얼쥐와 함께 즐겁게 읽는 AI

BEER_BEER_is_that_what_you_want? 🦫

영웅*^%&$ 2023. 6. 4. 14:20
728x90
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import random

# Load and preprocess data
df = pd.read_csv(
    "/kaggle/input/ml-olympiad-can-you-guess-beer-style/train.csv"
)

# Select columns
df = df[
    [
        "Description",
        "ABV",
        "Min IBU",
        "Max IBU",
        "Astringency",
        "Body",
        "Alcohol",
        "Bitter",
        "Sweet",
        "Sour",
        "Salty",
        "Fruits",
        "Hoppy",
        "Spices",
        "Malty",
        "review_aroma",
        "review_appearance",
        "review_palate",
        "review_taste",
        "review_overall",
        "number_of_reviews",
        "Style",
    ]
]

# Select numeric columns only
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

# Fill NaN for numeric columns
imp_mean = SimpleImputer(missing_values=np.nan, strategy="median")
df[numeric_cols] = imp_mean.fit_transform(df[numeric_cols])

# For non-numeric columns (in this case "Description"), you could fill with an empty string or some other strategy
df["Description"].fillna("", inplace=True)

# Split dataset
train_set, val_set, test_set = np.split(
    df.sample(frac=1, random_state=42),
    [int(len(df) * 0.8), int(len(df) * 0.9)],
)

# Process the description
vectorizer = TextVectorization(max_tokens=5000, output_sequence_length=200)
train_text = train_set['Description']
vectorizer.adapt(train_text.values)

# Create datasets
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('Style')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe), seed=42)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

train_ds = df_to_dataset(train_set)
val_ds = df_to_dataset(val_set)
test_ds = df_to_dataset(test_set)

# Define model
def create_model():
    # text input branch
    text_input = tf.keras.Input(shape=(), dtype=tf.string, name='Description')
    x = vectorizer(text_input)
    x = tf.keras.layers.Embedding(
        input_dim=5000, output_dim=64, mask_zero=True)(x)
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    text_output = tf.keras.layers.Dense(64, activation='relu')(x)

    # numeric input branch
    numeric_inputs = [tf.keras.Input(shape=(1,), name=c) for c in train_set.drop(columns=['Description', 'Style']).columns]
    x = tf.keras.layers.concatenate(numeric_inputs)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    numeric_output = tf.keras.layers.Dense(64, activation='relu')(x)

    # concatenate text and numeric branches
    concatenated = tf.keras.layers.concatenate([text_output, numeric_output])

    output = tf.keras.layers.Dense(len(df['Style'].unique()), activation='softmax')(concatenated)

    model = tf.keras.models.Model([text_input] + numeric_inputs, output)
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


# Train the model
model = create_model()
history = model.fit(train_ds, validation_data=val_ds, epochs=10, callbacks=[
                    tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)])

# Evaluate the model
model.evaluate(test_ds)

# Visualize training history
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper right')
plt.show()

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper left')
plt.show()
Epoch 1/10
13/13 [==============================] - 21s 432ms/step - loss: 257.0126 - accuracy: 0.0507 - val_loss: 151.4959 - val_accuracy: 0.0000e+00
Epoch 2/10
13/13 [==============================] - 3s 216ms/step - loss: 156.8023 - accuracy: 0.0628 - val_loss: 121.1608 - val_accuracy: 0.0192
Epoch 3/10
13/13 [==============================] - 3s 207ms/step - loss: 119.4563 - accuracy: 0.1039 - val_loss: 103.4732 - val_accuracy: 0.0385
Epoch 4/10
13/13 [==============================] - 2s 183ms/step - loss: 101.0891 - accuracy: 0.0870 - val_loss: 95.3936 - val_accuracy: 0.0769
Epoch 5/10
13/13 [==============================] - 2s 178ms/step - loss: 87.5844 - accuracy: 0.1014 - val_loss: 98.3966 - val_accuracy: 0.0962
Epoch 6/10
13/13 [==============================] - 2s 146ms/step - loss: 80.8935 - accuracy: 0.1014 - val_loss: 96.4117 - val_accuracy: 0.0577
Epoch 7/10
13/13 [==============================] - 3s 200ms/step - loss: 64.1204 - accuracy: 0.1111 - val_loss: 92.1796 - val_accuracy: 0.1346
Epoch 8/10
13/13 [==============================] - 2s 121ms/step - loss: 64.1571 - accuracy: 0.1473 - val_loss: 87.7057 - val_accuracy: 0.1154
Epoch 9/10
13/13 [==============================] - 2s 149ms/step - loss: 60.5621 - accuracy: 0.1184 - val_loss: 92.3790 - val_accuracy: 0.0769
Epoch 10/10
13/13 [==============================] - 2s 130ms/step - loss: 49.4580 - accuracy: 0.1449 - val_loss: 98.1194 - val_accuracy: 0.0962
2/2 [==============================] - 0s 16ms/step - loss: 52.7078 - accuracy: 0.1538
In [2]:
# Assuming `test_set` is your test dataset
preds = model.predict(df_to_dataset(test_set, shuffle=False, batch_size=1))
52/52 [==============================] - 3s 8ms/step
In [3]:
# Create DataFrame for submission
submission_df = pd.DataFrame()
submission_df['ID'] = test_set.index  # Use index as 'ID'
submission_df['Style'] = tf.argmax(preds, axis=-1).numpy()  # Assign predictions to 'Style' column

# Save DataFrame to csv file
submission_df.to_csv('submission.csv', index=False)
728x90