HC7 & HC8 -- Data Analysis

Below you will find a first exploration of the AirBnB_NYC_2019.csv dataset. The goal is for you to learn the tools for data exploration (many of these you learned in our online labs) and then be able to further explore the data and obtain results to report in our final presentation.

As you know, we must first import the packages we will use to do this:
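For example (pandas for dataframes, matplotlib for the plots later on; the exact import list in the original notebook may differ):

```python
import pandas as pd               # dataframes and summary statistics
import matplotlib.pyplot as plt   # plotting (used in the visualization section)
```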

EXPLORE THE DATA

To begin, we will read the data into a pandas dataframe and find out its size (dimensions/shape)

Alternatively, to find out how many observations (rows) in the dataset:
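A sketch of both steps. Since the full CSV is not included here, the toy DataFrame below stands in for pd.read_csv('AirBnB_NYC_2019.csv'); the column names match the dataset, but the numbers are made up:

```python
import pandas as pd

# In the notebook: df = pd.read_csv('AirBnB_NYC_2019.csv')
# Toy stand-in with the same column names, so the calls below run as-is:
df = pd.DataFrame({
    'name': ['Cozy loft', 'Sunny room', None],
    'host_name': ['Ann', 'Ben', None],
    'neighbourhood_group': ['Manhattan', 'Brooklyn', 'Manhattan'],
    'price': [150, 80, 200],
    'number_of_reviews': [10, 0, 5],
})

print(df.shape)   # (rows, columns)
print(len(df))    # number of observations (rows)
```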

To find out the names of the columns, we can print the header of the dataframe (with no rows)

Alternatively, to see sample data, we can print the header of the dataframe with the first two rows:
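Both header views might look like this (toy data standing in for the full CSV):

```python
import pandas as pd

df = pd.DataFrame({
    'name': ['Cozy loft', 'Sunny room'],
    'neighbourhood_group': ['Manhattan', 'Brooklyn'],
    'price': [150, 80],
})

print(df.head(0))   # column names only, no rows
print(df.head(2))   # column names plus the first two rows
```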

Let's dig a little deeper and produce some stats

To describe the numerical data:

The function describe() gives the number of values (count), average, standard deviation, minimum and maximum values as well as the 25th, 50th and 75th percentiles, for each numerical column. Alternatively, to retrieve and print the stats for a particular column:
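A sketch of both calls, on toy data (in the notebook, df is the full dataset):

```python
import pandas as pd

df = pd.DataFrame({
    'neighbourhood_group': ['Manhattan', 'Brooklyn', 'Manhattan', 'Bronx'],
    'price': [150, 80, 200, 55],
    'minimum_nights': [2, 1, 30, 2],
})

print(df.describe())            # count/mean/std/min/25%/50%/75%/max per numerical column
print(df['price'].describe())   # the same stats for the price column only
```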

The describe() function, as called above, provides numerical statistics for numerical data. To get some stats for the categorical data:

The describe() function with include=['O'] argument (that is a capital letter O), will provide the number of values (count), the number of unique values, the most frequent value (top) and the frequency of that value, for each categorical column.

As we did above, if you want to extract the stats for a particular categorical column:
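Both the include=['O'] call and the single-column version, sketched on toy data (the counts from the full dataset, discussed next, will of course differ):

```python
import pandas as pd

df = pd.DataFrame({
    'neighbourhood_group': ['Manhattan', 'Brooklyn', 'Manhattan', 'Bronx'],
    'price': [150, 80, 200, 55],
})

print(df.describe(include=['O']))            # count/unique/top/freq for each categorical (object) column
print(df['neighbourhood_group'].describe())  # the same four stats for one column
```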

Here we can see that all observations (listings) specify the neighbourhood_group (the count equals the number of rows in the dataset), that there are 5 possible neighbourhood_groups (unique == 5; these are the boroughs), and that Manhattan has the largest number of listings (21,661).

Let's look at another column:
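For example (toy dates; note that describe() skips null entries when counting):

```python
import pandas as pd

df = pd.DataFrame({
    'last_review': ['2019-06-23', None, '2019-05-01', '2019-06-23'],
})

print(df['last_review'].describe())  # count skips nulls; top is the most frequent date
```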

Here we can see that NOT all observations (listings) have an entry for last_review (count is less than number of rows in the dataset), we see that there are 1764 different dates for the last review, and that the date of most "last reviews" (top) was June 23 2019, with 1,413 reviews on that day.

Now we may wonder, what is the latest date for last_review in our dataset? To get the answer, first we convert the date (which is now a string) to a datetime format recognized by pandas, and then we can ask pandas for the latest (max) date:
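A minimal sketch of the conversion and the max query:

```python
import pandas as pd

df = pd.DataFrame({'last_review': ['2019-06-23', None, '2019-05-01']})

# Convert the string column to pandas datetimes, then ask for the latest date
df['last_review'] = pd.to_datetime(df['last_review'])
print(df['last_review'].max())
```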

Since we don't have data for the number of bookings for each listing, from the last_review analysis above we may hypothesize that there were fewer bookings overall after June -- unless summer guests got lazy with reviews :)

CLEAN THE DATA

It is always a good idea to clean your data before you start working. The most common problem you will encounter is missing data. If some observations (rows) do not provide a value for a particular variable (column), you may encounter errors when you try to compute statistics or plot the data.

To check if there are any missing values:
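Sketched on toy data with a few deliberate gaps:

```python
import pandas as pd

df = pd.DataFrame({
    'name': ['Cozy loft', None, 'Harbor view'],
    'price': [150, 80, 60],
    'reviews_per_month': [1.2, None, None],
})

print(df.isnull().sum())  # number of missing values per column
```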

Here we see that most columns do have entries for all the listings (0 null values), except for:
  - name (the name of the listing): 16 listings don't have a name
  - host_name: 21 hosts did not provide a name
  - last_review and reviews_per_month: 10052 listings seem to omit review data, although they all specify the total number of reviews. Perhaps some of those listings have 0 reviews? Let's find out:
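One way to check, sketched on toy data:

```python
import pandas as pd

df = pd.DataFrame({'number_of_reviews': [10, 0, 5, 0]})

# Is there at least one listing with zero reviews?
print((df['number_of_reviews'] == 0).any())
```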

Indeed!!! There are listings with 0 reviews. But how many of them?
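value_counts() answers exactly this, sketched here on toy data:

```python
import pandas as pd

df = pd.DataFrame({'number_of_reviews': [10, 0, 5, 0, 0]})

# Each distinct value on the left, how many listings have it on the right
print(df['number_of_reviews'].value_counts())
```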

Here we see on the left the different values in the number_of_reviews column, and on the right the number of listings with that value. And voilà, there are 10052 listings with 0 reviews, and thus null entries for last_review and reviews_per_month for those listings.

So now, let's clean the data by getting rid of the null values.
We can do that by replacing null values in our numerical columns with 0 (we can do that for the date too, now that we converted it):

And replace null values in our categorical data (string values) with the empty string:

Now we can check to make sure we got rid of all null values:
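The three cleaning steps above, sketched end-to-end on toy data (numeric nulls filled with 0, the converted date column filled with the epoch as a placeholder, string nulls filled with the empty string):

```python
import pandas as pd

df = pd.DataFrame({
    'name': ['Cozy loft', None],
    'host_name': [None, 'Ben'],
    'reviews_per_month': [1.2, None],
    'last_review': ['2019-06-23', None],
})

# Numerical columns: replace nulls with 0
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

# Date column: convert first, then fill nulls (here with the epoch, 1970-01-01, as a placeholder)
df['last_review'] = pd.to_datetime(df['last_review'])
df['last_review'] = df['last_review'].fillna(pd.Timestamp(0))

# Categorical (string) columns: replace nulls with the empty string
for col in ['name', 'host_name']:
    df[col] = df[col].fillna('')

print(df.isnull().sum())  # every column should now report 0 nulls
```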

And voilà, our data is free of null values!!!

Another sensible thing to do, and one that should always be considered, is anonymizing our data for privacy protection. When working with data, it is always important to ask whether our analysis needs the data that could potentially identify individual subjects, and if not, it is a good idea to remove that data altogether.

In this case the data comes from Kaggle and it has already been anonymized somewhat, but we still have host_id and host_name information. If we will not use that data in our analysis, it is a good idea to remove it:
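Dropping the identifying columns might look like this sketch:

```python
import pandas as pd

df = pd.DataFrame({
    'host_id': [101, 102],
    'host_name': ['Ann', 'Ben'],
    'price': [150, 80],
})

# Drop the columns that could identify the host
df = df.drop(columns=['host_id', 'host_name'])
print(df.columns.tolist())
```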

And we can see that we no longer have columns that may identify the host!

VISUALIZE THE DATA

Let's look at the data by borough:

To find out the names of the boroughs (we kind of know this already...):
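For example (toy rows covering the five boroughs):

```python
import pandas as pd

df = pd.DataFrame({
    'neighbourhood_group': ['Manhattan', 'Brooklyn', 'Manhattan',
                            'Staten Island', 'Bronx', 'Queens'],
})

print(df['neighbourhood_group'].unique())  # the five boroughs
```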

Let's group the data by borough (neighbourhood_group)

Now that the data is grouped by borough in boro_group, let's plot the data to observe the average price per borough:
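Both the grouping and the plot, sketched on toy data (the Agg backend is set only so this sketch runs without a display; a notebook does not need it):

```python
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # non-interactive backend so this sketch runs headless
import matplotlib.pyplot as plt

df = pd.DataFrame({
    'neighbourhood_group': ['Manhattan', 'Brooklyn', 'Manhattan', 'Bronx'],
    'price': [150, 80, 200, 55],
})

boro_group = df.groupby('neighbourhood_group')
avg_price = boro_group['price'].mean()  # average price per borough
print(avg_price)

avg_price.plot(kind='bar')  # one bar per borough
```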

To save your plot to an image file, you can do the following: leave more space at the bottom of the figure so the borough names on the x axis are fully visible, label the y axis, save the figure, and then clear it so we can make more plots later:
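A sketch of that sequence; the output filename is hypothetical:

```python
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # non-interactive backend so this sketch runs headless
import matplotlib.pyplot as plt
from pathlib import Path

avg_price = pd.Series([55.0, 80.0, 175.0],
                      index=['Bronx', 'Brooklyn', 'Manhattan'], name='price')
avg_price.plot(kind='bar')

plt.gcf().subplots_adjust(bottom=0.3)    # leave room for the borough names on the x axis
plt.ylabel('Average price (USD)')        # label the y axis
plt.savefig('avg_price_by_borough.png')  # hypothetical filename
plt.clf()                                # clear the figure so we can make more plots later
print(Path('avg_price_by_borough.png').exists())
```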

We can observe that, as we all probably expected, Manhattan has the highest prices.

Let's look at the "minimum number of nights" requirement by borough:

Let's save this plot to a file:
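Analogous to the price plot, both steps might look like this (toy data, hypothetical filename):

```python
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # non-interactive backend so this sketch runs headless
import matplotlib.pyplot as plt

df = pd.DataFrame({
    'neighbourhood_group': ['Manhattan', 'Brooklyn', 'Manhattan', 'Bronx'],
    'minimum_nights': [2, 5, 30, 6],
})

avg_nights = df.groupby('neighbourhood_group')['minimum_nights'].mean()
avg_nights.plot(kind='bar')
plt.ylabel('Average minimum nights')
plt.savefig('min_nights_by_borough.png')  # hypothetical filename
plt.clf()
```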

We can observe that, on average, Manhattan requires the highest number of minimum nights per stay. All neighborhoods require, on average, stays that are longer than 4 nights.

Now you may wonder, what is the maximum number of minimum_nights in each borough?

Now we can see that there are long-term listings in all boroughs, with Manhattan having the highest minimum stay (more than 3 years!!!)

Let's say I now want to look at data from Staten Island only (note I am selecting rows here!!!)
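Row selection with a boolean mask, sketched on toy data:

```python
import pandas as pd

df = pd.DataFrame({
    'neighbourhood_group': ['Manhattan', 'Staten Island', 'Brooklyn', 'Staten Island'],
    'price': [150, 60, 80, 95],
})

# The boolean mask selects ROWS where the condition holds
st = df[df['neighbourhood_group'] == 'Staten Island']
print(len(st))
```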

I now have a new dataframe called st with only the rows where neighbourhood_group == 'Staten Island', and I can see that there are 373 listings for Staten Island.

YOUR TURN!

Now that you know how to explore the data, clean the data, obtain statistics about the data, visualize the data and select a subset of the data based on the value in a particular column (e.g. neighbourhood_group == 'Staten Island'), think about how you want to explore the data for your analysis:

  1. Individually, analyze the data in your neighborhood and compare to the data for the borough.
  2. As a group, think about an overall data-driven discussion to compare your borough to the others.

As you explore your data, keep in mind your observations and conclusions from HC2 and HC3 and see if you can make any connections, or if you find that the data supports those conclusions.