Use the H2O AutoML (Automatic Machine Learning) toolkit to classify glioblastomas using the dataset Data_Glioblastoma5Patients_SC.csv, evaluate the model performance, and discuss the results.
The code and the CSV file are provided, and I have done most of the work. Please revise the code so that it works with Data_Glioblastoma5Patients_SC.csv in a Jupyter notebook.
{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "4e37649f", "metadata": {}, "outputs": [], "source": [ "import h2o\n", "from h2o.automl import H2OAutoML\n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "ac821c50", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Checking whether there is an H2O instance running at http://localhost:54321 . connected.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
H2O_cluster_uptime: |
1 hour 11 mins |
H2O_cluster_timezone: |
America/Chicago |
H2O_data_parsing_timezone: |
UTC |
H2O_cluster_version: |
3.32.1.3 |
H2O_cluster_version_age: |
3 days
|
H2O_cluster_name: |
H2O_from_python_jennifernwogu_ilnoqf |
H2O_cluster_total_nodes: |
1 |
H2O_cluster_free_memory: |
458.5 Mb |
H2O_cluster_total_cores: |
4 |
H2O_cluster_allowed_cores: |
4 |
H2O_cluster_status: |
locked, healthy |
H2O_connection_url: |
http://localhost:54321 |
H2O_connection_proxy: |
{\"http\": null, \"https\": null} |
H2O_internal_security: |
False |
H2O_API_Extensions: |
Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 |
Python_version: |
3.8.8 final |
" ], "text/plain": [ "-------------------------- ------------------------------------------------------------------\n", "H2O_cluster_uptime: 1 hour 11 mins\n", "H2O_cluster_timezone: America/Chicago\n", "H2O_data_parsing_timezone: UTC\n", "H2O_cluster_version: 3.32.1.3\n", "H2O_cluster_version_age: 3 days\n", "H2O_cluster_name: H2O_from_python_jennifernwogu_ilnoqf\n", "H2O_cluster_total_nodes: 1\n", "H2O_cluster_free_memory: 458.5 Mb\n", "H2O_cluster_total_cores: 4\n", "H2O_cluster_allowed_cores: 4\n", "H2O_cluster_status: locked, healthy\n", "H2O_connection_url: http://localhost:54321\n", "H2O_connection_proxy: {\"http\": null, \"https\": null}\n", "H2O_internal_security: False\n", "H2O_API_Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4\n", "Python_version: 3.8.8 final\n", "-------------------------- ------------------------------------------------------------------" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Parse progress: |█████████████████████████████████████████████████████████| 100%\n", "Parse progress: |█████████████████████████████████████████████████████████| 100%\n" ] } ], "source": [ "h2o.init()\n", "\n", "# Import a sample binary outcome train/test set into H2O\n", "train = h2o.import_file(\"https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv\")\n", "test = h2o.import_file(\"https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv\")\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "9de5a992", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Identify predictors and response\n", "x = train.columns\n", "y = \"response\"\n", "x.remove(y)\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "47ee65d6", "metadata": {}, "outputs": [], "source": [ "\n", "# For binary classification, response should be a factor\n", "train[y] = train[y].asfactor()\n", "test[y] = test[y].asfactor()" ] }, { "cell_type": "code", 
"execution_count": 5, "id": "101f1881", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AutoML progress: |████████████████████████████████████████████████████████| 100%\n" ] } ], "source": [ "# Run AutoML for 20 base models (limited to 1 hour max runtime by default)\n", "aml = H2OAutoML(max_models=20, seed=1)\n", "aml.train(x=x, y=y, training_frame=train)" ] }, { "cell_type": "code", "execution_count": 6, "id": "c06878d6", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
model_id
|
auc |
logloss |
aucpr |
mean_per_class_error |
rmse |
mse |
---|
StackedEnsemble_AllModels_AutoML_20210522_195449
|
0.788938 |
0.550169 |
0.807374 |
0.324269 |
0.432087 |
0.186699 |
StackedEnsemble_BestOfFamily_AutoML_20210522_195449 |
0.787794 |
0.551334 |
0.806123 |
0.325839 |
0.432641 |
0.187178 |
GBM_5_AutoML_20210522_195449
|
0.78219
|
0.558353 |
0.801738 |
0.319658 |
0.435512 |
0.18967
|
GBM_2_AutoML_20210522_195449
|
0.777673 |
0.562514 |
0.796364 |
0.334056 |
0.437583 |
0.191479 |
GBM_1_AutoML_20210522_195449
|
0.777294 |
0.562744 |
0.799184 |
0.356261 |
0.437727 |
0.191605 |
GBM_3_AutoML_20210522_195449
|
0.775488 |
0.564794 |
0.794892 |
0.327971 |
0.438722 |
0.192477 |
GBM_grid__1_AutoML_20210522_195449_model_1
|
0.772926 |
0.568181 |
0.791195 |
0.322808 |
0.439997 |
0.193598 |
GBM_4_AutoML_20210522_195449
|
0.77248
|
0.569483 |
0.792582 |
0.336913 |
0.440873 |
0.194369 |
GBM_grid__1_AutoML_20210522_195449_model_2
|
0.77049
|
0.569351 |
0.788633 |
0.369523 |