{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Dataset Bug Detection\n", "\n", "In this example, we will demonstrate how to detect bugs in a data set using the public Airlines data set." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This demo uses the category_encoders library to perform binary encoding on some of the features,\n", "# so it must be installed first. %pip (rather than !pip) installs into the environment of the running kernel.\n", "%pip install category_encoders" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", " | year | \n", "month | \n", "day | \n", "dep_time | \n", "sched_dep_time | \n", "dep_delay | \n", "arr_time | \n", "sched_arr_time | \n", "arr_delay | \n", "carrier | \n", "... | \n", "dest | \n", "air_time | \n", "distance | \n", "hour | \n", "minute | \n", "time_hour | \n", "date | \n", "day_index | \n", "DayOfWeek | \n", "Month | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "2013 | \n", "1 | \n", "1 | \n", "517.0 | \n", "515 | \n", "2.0 | \n", "830.0 | \n", "819 | \n", "11.0 | \n", "UA | \n", "... | \n", "IAH | \n", "227.0 | \n", "1400 | \n", "5 | \n", "15 | \n", "1/1/2013 5:00 | \n", "2013-01-01 | \n", "0 | \n", "Tuesday | \n", "January | \n", "
1 | \n", "2013 | \n", "1 | \n", "1 | \n", "533.0 | \n", "529 | \n", "4.0 | \n", "850.0 | \n", "830 | \n", "20.0 | \n", "UA | \n", "... | \n", "IAH | \n", "227.0 | \n", "1416 | \n", "5 | \n", "29 | \n", "1/1/2013 5:00 | \n", "2013-01-01 | \n", "0 | \n", "Tuesday | \n", "January | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
336774 | \n", "2013 | \n", "9 | \n", "30 | \n", "NaN | \n", "1159 | \n", "NaN | \n", "NaN | \n", "1344 | \n", "NaN | \n", "MQ | \n", "... | \n", "CLE | \n", "NaN | \n", "419 | \n", "11 | \n", "59 | \n", "30-09-2013 11:00 | \n", "2013-09-30 | \n", "272 | \n", "Monday | \n", "September | \n", "
336775 | \n", "2013 | \n", "9 | \n", "30 | \n", "NaN | \n", "840 | \n", "NaN | \n", "NaN | \n", "1020 | \n", "NaN | \n", "MQ | \n", "... | \n", "RDU | \n", "NaN | \n", "431 | \n", "8 | \n", "40 | \n", "30-09-2013 08:00 | \n", "2013-09-30 | \n", "272 | \n", "Monday | \n", "September | \n", "
336776 rows × 23 columns
\n", "\n", " | year | \n", "month | \n", "day | \n", "dep_time | \n", "sched_dep_time | \n", "dep_delay | \n", "arr_time | \n", "sched_arr_time | \n", "arr_delay | \n", "carrier | \n", "... | \n", "dest | \n", "air_time | \n", "distance | \n", "hour | \n", "minute | \n", "time_hour | \n", "date | \n", "day_index | \n", "DayOfWeek | \n", "Month | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
101780 | \n", "2013 | \n", "12 | \n", "21 | \n", "2.0 | \n", "2359 | \n", "3.0 | \n", "445.0 | \n", "445 | \n", "0.0 | \n", "B6 | \n", "... | \n", "PSE | \n", "206.0 | \n", "1617 | \n", "23 | \n", "59 | \n", "21-12-2013 23:00 | \n", "2013-12-21 | \n", "354 | \n", "Saturday | \n", "December | \n", "
101781 | \n", "2013 | \n", "12 | \n", "21 | \n", "29.0 | \n", "2040 | \n", "229.0 | \n", "138.0 | \n", "2220 | \n", "198.0 | \n", "WN | \n", "... | \n", "MDW | \n", "117.0 | \n", "725 | \n", "20 | \n", "40 | \n", "21-12-2013 20:00 | \n", "2013-12-21 | \n", "354 | \n", "Saturday | \n", "December | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
111294 | \n", "2013 | \n", "12 | \n", "31 | \n", "NaN | \n", "600 | \n", "NaN | \n", "NaN | \n", "735 | \n", "NaN | \n", "UA | \n", "... | \n", "ORD | \n", "NaN | \n", "719 | \n", "6 | \n", "0 | \n", "31-12-2013 06:00 | \n", "2013-12-31 | \n", "364 | \n", "Tuesday | \n", "December | \n", "
111295 | \n", "2013 | \n", "12 | \n", "31 | \n", "NaN | \n", "830 | \n", "NaN | \n", "NaN | \n", "1154 | \n", "NaN | \n", "UA | \n", "... | \n", "LAX | \n", "NaN | \n", "2475 | \n", "8 | \n", "30 | \n", "31-12-2013 08:00 | \n", "2013-12-31 | \n", "364 | \n", "Tuesday | \n", "December | \n", "
9516 rows × 23 columns
\n", "