Rinse and repeat#

The visit_data.csv file contains data produced by a random data generator: fake people’s names, their gender, email, ip_address, and the number of visits each of them made to a particular web page.

Importing libraries and packages#

1# Mathematical operations and data manipulation
2import numpy as np
3import pandas as pd
4
5# Plotting
6import matplotlib.pyplot as plt
7
8%matplotlib inline

Set paths#

1# Path to datasets directory
2data_path = "./datasets"
3# Path to assets directory (for saving results to)
4assets_path = "./assets"

Loading dataset#

1dataset = pd.read_csv(f"{data_path}/visit_data.csv")
2dataset.head()
id first_name last_name email gender ip_address visit
0 1 Sonny Dahl sdahl0@mysql.com Male 135.36.96.183 1225.0
1 2 NaN NaN dhoovart1@hud.gov NaN 237.165.194.143 919.0
2 3 Gar Armal garmal2@technorati.com NaN 166.43.137.224 271.0
3 4 Chiarra Nulty cnulty3@newyorker.com NaN 139.98.137.108 1002.0
4 5 NaN NaN sleaver4@elegantthemes.com NaN 46.117.117.27 2434.0

Wrangling#

1print(
2    "First name is duplicated - {}".format(
3        any(dataset.first_name.duplicated())
4    )
5)
6print(
7    "Last name is duplicated - {}".format(any(dataset.last_name.duplicated()))
8)
9print("Email is duplicated - {}".format(any(dataset.email.duplicated())))
First name is duplicated - True
Last name is duplicated - True
Email is duplicated - False
 1# Different ways to format boolean values for the % operator
 2print(
 3    "The column Email contains NaN - %r " % dataset.email.isnull().values.any()
 4)
 5print(
 6    "The column IP Address contains NaN - %s "
 7    % dataset.ip_address.isnull().values.any()
 8)
 9print(
10    "The column Visit contains NaN - %s " % dataset.visit.isnull().values.any()
11)
The column Email contains NaN - False 
The column IP Address contains NaN - False 
The column Visit contains NaN - True 
 1# numpy.isfinite tests element-wise for finiteness (not infinity and not
 2# Not a Number).
 3# Store the previous size of the dataset for comparison with the new size
 4size_prev = dataset.shape
 5df = dataset[np.isfinite(dataset["visit"])]  # Boolean indexing returns a new DataFrame; dataset itself is not modified.
 6size_after = df.shape
 7
 8print(
 9    "The size of previous data was - {prev[0]} rows and the size of the "
10    "new one is - {after[0]} rows".format(prev=size_prev, after=size_after)
11)
The size of previous data was - 1000 rows and the size of the new one is - 974 rows
1plt.boxplot(df.visit, notch=True)
{'whiskers': [<matplotlib.lines.Line2D at 0x7fbfbde1f610>,
  <matplotlib.lines.Line2D at 0x7fbfbde1f8e0>],
 'caps': [<matplotlib.lines.Line2D at 0x7fbfbde1fc70>,
  <matplotlib.lines.Line2D at 0x7fbfbde1fe80>],
 'boxes': [<matplotlib.lines.Line2D at 0x7fbfbde12220>],
 'medians': [<matplotlib.lines.Line2D at 0x7fbfbde3e190>],
 'fliers': [<matplotlib.lines.Line2D at 0x7fbfbde3e460>],
 'means': []}
../../_images/36a1e7a8be3479756a91d560a25af6f6f6154da01ae93af1f83932a2eaa5d45a.png
1# Abusing the fact that str.format() ignores surplus positional arguments:
2# *df1.shape passes (rows, columns), but the single {} only consumes the rows
3df1 = df[(df["visit"] <= 2900) & (df["visit"] >= 100)]
4print(
5    "After getting rid of outliers the new size of the data "
6    "is - {}".format(*df1.shape)
7)
After getting rid of outliers the new size of the data is - 923