Data Preparation

%matplotlib inline

import pandas
from sqlalchemy import create_engine

Download the data

url = ("http://data.insideairbnb.com/france/ile-de-france/paris"
       "/2019-09-16/data/listings.csv.gz")
%time db = pandas.read_csv(url)
<string>:2: DtypeWarning: Columns (43,61,62) have mixed types. Specify dtype option on import or set low_memory=False.
CPU times: user 9.02 s, sys: 4.61 s, total: 13.6 s
Wall time: 2min 22s
# Core listing attributes to retain in the trimmed dataset
tokeep = [
    "id",
    "neighbourhood_cleansed",
    "property_type",
    "room_type",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "bed_type",
]
# Every review-score column present in the raw table
review = [col for col in db.columns if "review_scores" in col]

Build numeric price

def cleaner(p):
    """Convert a price string such as "$1,234.00" to a float.

    Strips the leading/trailing dollar sign and thousands separators
    before converting. (PEP 8 discourages binding a lambda to a name.)
    """
    return float(p.replace(",", "").strip("$"))
# Derived numeric price column; the raw "price" column keeps its
# "$1,234.00" text form.
db["Price"] = db["price"].map(cleaner)

Keep relevant variables

# Columns needed downstream: core attributes, every review-score column,
# and the cleaned numeric price; drop rows missing any of them.
keep_cols = tokeep + review + ["Price"]
final = db[keep_cols].dropna()

Write out

  • As .csv

! rm -f /paris_abb.csv.zip
final[tokeep + ["Price"]].to_csv("paris_abb.csv.zip",
                                 index=False)
  • As .sqlite

! rm -f /paris_abb_mini.db
engine = create_engine("sqlite:///paris_abb_mini.db")
final[tokeep + ["Price"]].to_sql("db", engine, index=False)

Index on neighbourhood_cleansed

# DDL to index the mini table on the neighbourhood column, the key
# that downstream queries filter on.
qry = """
CREATE INDEX idx_neighbourhood_cleansed
ON db (neighbourhood_cleansed);
"""
conn = engine.connect()
# NOTE(review): executing a raw SQL string works on SQLAlchemy 1.x
# (the ResultProxy repr in the output confirms 1.x); SQLAlchemy 2.x
# would require wrapping the string in sqlalchemy.text().
conn.execute(qry)
<sqlalchemy.engine.result.ResultProxy at 0x7fb1c2f75fd0>
# Print the stored DDL (column 4 of sqlite_master) for every object in
# the database, verifying the table and the new index exist.
out = conn.execute("select * from SQLite_master")

for row in out.fetchall():
    print(row[4])
CREATE TABLE db (
	id BIGINT, 
	neighbourhood_cleansed TEXT, 
	property_type TEXT, 
	room_type TEXT, 
	accommodates BIGINT, 
	bathrooms FLOAT, 
	bedrooms FLOAT, 
	beds FLOAT, 
	bed_type TEXT, 
	"Price" FLOAT
)
CREATE INDEX idx_neighbourhood_cleansed
ON db (neighbourhood_cleansed)
# Release the connection and the engine's pool so the database file is
# fully flushed and closed before moving on.
conn.close()
engine.dispose()
  • Full table dump on SQLite (optional; the resulting file was eventually removed from the pack because it is ~270MB)

! rm -f paris_abb.db
engine = create_engine("sqlite:///paris_abb.db")
db.rename(columns={"Price": "price_val"})\
  .to_sql("db", engine, index=False)

Index on neighbourhood_cleansed

# Same index as for the mini database: speed up lookups by
# neighbourhood on the full dump.
qry = """
CREATE INDEX idx_neighbourhood_cleansed
ON db (neighbourhood_cleansed);
"""
conn = engine.connect()
# NOTE(review): executing a raw SQL string works on SQLAlchemy 1.x
# (the ResultProxy repr in the output confirms 1.x); SQLAlchemy 2.x
# would require wrapping the string in sqlalchemy.text().
conn.execute(qry)
<sqlalchemy.engine.result.ResultProxy at 0x7fb1aa334860>
# Print the stored DDL (column 4 of sqlite_master) for every object in
# the full database, verifying the table and index were created.
out = conn.execute("select * from SQLite_master")

for row in out.fetchall():
    print(row[4])
CREATE TABLE db (
	id BIGINT, 
	listing_url TEXT, 
	scrape_id BIGINT, 
	last_scraped TEXT, 
	name TEXT, 
	summary TEXT, 
	space TEXT, 
	description TEXT, 
	experiences_offered TEXT, 
	neighborhood_overview TEXT, 
	notes TEXT, 
	transit TEXT, 
	access TEXT, 
	interaction TEXT, 
	house_rules TEXT, 
	thumbnail_url FLOAT, 
	medium_url FLOAT, 
	picture_url TEXT, 
	xl_picture_url FLOAT, 
	host_id BIGINT, 
	host_url TEXT, 
	host_name TEXT, 
	host_since TEXT, 
	host_location TEXT, 
	host_about TEXT, 
	host_response_time TEXT, 
	host_response_rate TEXT, 
	host_acceptance_rate FLOAT, 
	host_is_superhost TEXT, 
	host_thumbnail_url TEXT, 
	host_picture_url TEXT, 
	host_neighbourhood TEXT, 
	host_listings_count FLOAT, 
	host_total_listings_count FLOAT, 
	host_verifications TEXT, 
	host_has_profile_pic TEXT, 
	host_identity_verified TEXT, 
	street TEXT, 
	neighbourhood TEXT, 
	neighbourhood_cleansed TEXT, 
	neighbourhood_group_cleansed FLOAT, 
	city TEXT, 
	state TEXT, 
	zipcode TEXT, 
	market TEXT, 
	smart_location TEXT, 
	country_code TEXT, 
	country TEXT, 
	latitude FLOAT, 
	longitude FLOAT, 
	is_location_exact TEXT, 
	property_type TEXT, 
	room_type TEXT, 
	accommodates BIGINT, 
	bathrooms FLOAT, 
	bedrooms FLOAT, 
	beds FLOAT, 
	bed_type TEXT, 
	amenities TEXT, 
	square_feet FLOAT, 
	price TEXT, 
	weekly_price TEXT, 
	monthly_price TEXT, 
	security_deposit TEXT, 
	cleaning_fee TEXT, 
	guests_included BIGINT, 
	extra_people TEXT, 
	minimum_nights BIGINT, 
	maximum_nights BIGINT, 
	minimum_minimum_nights BIGINT, 
	maximum_minimum_nights BIGINT, 
	minimum_maximum_nights BIGINT, 
	maximum_maximum_nights BIGINT, 
	minimum_nights_avg_ntm FLOAT, 
	maximum_nights_avg_ntm FLOAT, 
	calendar_updated TEXT, 
	has_availability TEXT, 
	availability_30 BIGINT, 
	availability_60 BIGINT, 
	availability_90 BIGINT, 
	availability_365 BIGINT, 
	calendar_last_scraped TEXT, 
	number_of_reviews BIGINT, 
	number_of_reviews_ltm BIGINT, 
	first_review TEXT, 
	last_review TEXT, 
	review_scores_rating FLOAT, 
	review_scores_accuracy FLOAT, 
	review_scores_cleanliness FLOAT, 
	review_scores_checkin FLOAT, 
	review_scores_communication FLOAT, 
	review_scores_location FLOAT, 
	review_scores_value FLOAT, 
	requires_license TEXT, 
	license TEXT, 
	jurisdiction_names TEXT, 
	instant_bookable TEXT, 
	is_business_travel_ready TEXT, 
	cancellation_policy TEXT, 
	require_guest_profile_picture TEXT, 
	require_guest_phone_verification TEXT, 
	calculated_host_listings_count BIGINT, 
	calculated_host_listings_count_entire_homes BIGINT, 
	calculated_host_listings_count_private_rooms BIGINT, 
	calculated_host_listings_count_shared_rooms BIGINT, 
	reviews_per_month FLOAT, 
	price_val FLOAT
)
CREATE INDEX idx_neighbourhood_cleansed
ON db (neighbourhood_cleansed)
# Close the connection and dispose of the engine so the full dump's
# file handle is released.
conn.close()
engine.dispose()
  • Review scores

! rm -f paris_abb_review.csv.zip
final[["id"] + review].to_csv("paris_abb_review.csv.zip",
                              index=False)