import pandas as pd
import numpy as np
import warnings
# NOTE(review): silencing ALL warnings also hides pandas deprecation and
# chained-assignment notices; consider filtering specific categories only.
warnings.filterwarnings("ignore")
def read_data_from_csv(path='zomato.csv'):
    """Load the raw Zomato dataset from a CSV file.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file (default ``'zomato.csv'``).  The
        parameter generalizes the previously hard-coded filename while
        keeping all existing zero-argument call sites working.

    Returns
    -------
    pandas.DataFrame
        The raw, uncleaned dataset.
    """
    hotels = pd.read_csv(path)
    return hotels
Task 1 : Removing Unwanted Columns
Removing unwanted columns refers to the process of eliminating irrelevant or unnecessary columns from a dataset. This can improve data analysis and visualization by reducing clutter and focusing on the most important information. It involves identifying and selecting the columns to be removed and executing the removal process using tools like programming languages, database management systems, or spreadsheet software.
def remove_unwanted_columns():
    """Drop irrelevant columns, keeping only the fields the rest of the
    cleaning pipeline relies on."""
    #DO NOT REMOVE FOLLOWING LINE
    #call read_data_from_csv() function to get dataframe
    hotels = read_data_from_csv()
    # Explicit whitelist of the columns to keep, in their final order.
    wanted = [
        'name', 'online_order', 'book_table', 'rate', 'votes', 'location',
        'rest_type', 'dish_liked', 'cuisines',
        'approx_cost(for two people)', 'listed_in(type)',
    ]
    return hotels[wanted]
Task 2 : Renaming and Selecting Columns in a Dataset
Renaming columns involves changing the names of one or more columns in a dataset to make them more meaningful or consistent. Selecting columns refers to the process of choosing only specific columns to be included in a dataset, while excluding all others. These techniques are useful for improving the organization and readability of data and can help streamline data analysis. By renaming and selecting only the relevant columns, data scientists can create a more focused and manageable dataset that is better suited for their specific analysis needs.
def rename_columns():
    """Rename the awkward source column names to short, analysis-friendly
    ones (rating, approx_cost, type)."""
    #DO NOT REMOVE FOLLOWING LINE
    #call remove_unwanted_columns() function to get dataframe
    hotels = remove_unwanted_columns()
    #task2: rename columns, only these columns are allowed in the dataset
    # name, online_order, book_table, rating, votes, location, rest_type,
    # dish_liked, cuisines, approx_cost, type
    new_names = {
        'rate': 'rating',
        'approx_cost(for two people)': 'approx_cost',
        'listed_in(type)': 'type',
    }
    return hotels.rename(columns=new_names)
Task 3 : Dealing with Null Values in a Dataset
Handling null values refers to the process of identifying and addressing missing or incomplete data in each column of a dataset. This involves using techniques like imputation, where missing values are replaced with estimated values based on other data, or deletion, where incomplete records are removed entirely. Proper handling of null values is critical for accurate data analysis and can help prevent bias and errors in results.
#task3: handle null values of each column
def null_value_check():
    """Handle null values in every column of the dataset.

    Text columns are imputed with the placeholder 'NA', the numeric-ish
    columns (votes, approx_cost) with 0, and rating with an empty string,
    matching what the downstream cleaning steps expect.

    Returns
    -------
    pandas.DataFrame
        Dataset with no remaining NaN values in the listed columns.
    """
    #DO NOT REMOVE FOLLOWING LINE
    #call rename_columns() function to get dataframe
    hotels = rename_columns()
    # A single fillna call with a per-column mapping replaces the chain of
    # hotels[col].fillna(..., inplace=True) calls: chained inplace fillna
    # may operate on a copy (pandas chained-assignment) and is deprecated,
    # while the dict form is supported and produces the same result.
    fill_values = {
        'name': 'NA',
        'online_order': 'NA',
        'book_table': 'NA',
        'rating': '',
        'votes': 0,
        'location': 'NA',
        'rest_type': 'NA',
        'dish_liked': 'NA',
        'cuisines': 'NA',
        'approx_cost': 0,
        'type': 'NA',
    }
    hotels = hotels.fillna(fill_values)
    return hotels
Task 4 : Identifying Duplicate Data in a Dataset
Finding duplicates in a dataset refers to the process of identifying records that are identical or nearly identical to one another. Duplicate data can skew analysis results and waste computational resources, so it is important to identify and remove duplicates before analyzing data. This can be achieved using algorithms that compare records and identify common attributes, or through manual inspection of the dataset.
Hint:
drop all the duplicate values keeping the first value as it is
#task4 #find duplicates in the dataset
def find_duplicates():
    """Drop exact duplicate rows, keeping the first occurrence of each."""
    #DO NOT REMOVE FOLLOWING LINE
    #call null_value_check() function to get dataframe
    hotels = null_value_check()
    # keep='first' is the pandas default and matches the task hint.
    hotels = hotels.drop_duplicates(keep='first')
    return hotels
Task 5 : Text Cleaning
Text cleaning refers to the process of removing irrelevant or unnecessary text from all the columns in a dataset. This is an essential step in data preprocessing and analysis, as it ensures that the data is accurate and reliable. Text cleaning can involve tasks such as removing stopwords, punctuation, and special characters, as well as correcting spelling and grammar errors.
Hint:
we have irrelevant review text (e.g. the strings "RATED", "Rated") leaked into our name, online_order and other columns
remove this irrelevant text from all the columns
#task5 removing irrelevant text from all the columns
def removing_irrelevant_text():
    """Drop rows that contain leaked review text ('RATED'/'Rated') in any
    column.

    Returns
    -------
    pandas.DataFrame
        Dataset with the review-text rows removed.
    """
    #DO NOT REMOVE FOLLOWING LINE
    #call find_duplicates() function to get dataframe
    hotels = find_duplicates()
    for col in hotels.columns:
        # astype(str) so the .str accessor also works on non-object
        # columns (e.g. a numeric votes column); the original
        # hotels[col].str.contains(...) raises AttributeError on those.
        # na=False makes the NaN handling explicit, and ~mask is the
        # idiomatic negation instead of '== False'.
        mask = hotels[col].astype(str).str.contains('RATED|Rated', na=False)
        hotels = hotels[~mask]
    return hotels
Task 6 : Unique Value Check and Irrelevant Value Handling
The process of examining each column in a dataset to identify and handle any irrelevant data, while also verifying the uniqueness of values within each column. This helps ensure data accuracy and integrity in analysis and decision-making.
Hint:
online order column should have only yes and no because it is necessary to have the online order as yes or no only for zomato to perform further analysis, remove other values
check for rating column and remove NEW,- values to 0 and remove /5 as rating column should only contain decimal values
#task6: check for unique values in each column and handle the irrelevant values
def check_for_unique_values():
    """Keep only valid categorical values and normalise the rating column.

    - online_order / book_table are restricted to 'Yes'/'No'.
    - rating has the '/5' suffix stripped and the placeholders 'NEW',
      '-', '' and NaN mapped to 0 so it only holds decimal values.

    Returns
    -------
    pandas.DataFrame
    """
    #DO NOT REMOVE FOLLOWING LINE
    #call removing_irrelevant_text() function to get dataframe
    hotels = removing_irrelevant_text()
    hotels = hotels[hotels['online_order'].isin(['Yes', 'No'])]
    hotels = hotels[hotels['book_table'].isin(['Yes', 'No'])]
    # Strip the '/5' suffix, then map every non-numeric placeholder to 0.
    # The original chain missed '' (introduced by the earlier fillna('')
    # on rating), leaving empty strings in a numeric column.
    hotels['rating'] = (
        hotels['rating']
        .str.replace('/5', '', regex=False)
        .replace({'NEW': 0, '-': 0, '': 0, np.nan: 0})
    )
    return hotels
Task 7 : Cleaning and Exporting Zomato Dataset
The process of cleaning the Zomato dataset by removing any unknown or unidentifiable characters and exporting the cleaned dataset to a new file named "zomatocleaned.csv". This involves identifying and removing any symbols, special characters, or non-standard characters that may interfere with proper data analysis. By exporting the cleaned dataset to a new file, the original dataset can be preserved and the cleaned data can be easily accessed for further analysis and decision-making.
Hint:
remove the unknown characters from the dataset; we have the à character in our names column
#task7: remove the unknown character from the dataset and export it to "zomatocleaned.csv"
def remove_the_unknown_character():
    """Strip mis-encoded character runs from the name column, then export
    the cleaned dataset to 'zomatocleaned.csv' and return it."""
    #DO NOT REMOVE FOLLOWING LINE
    #call check_for_unique_values() function to get dataframe
    dataframe = check_for_unique_values()
    #remove unknown character from dataset
    pattern = '[Ãx][^A-Za-z]+'
    dataframe['name'] = dataframe['name'].str.replace(pattern, '', regex=True)
    #export cleaned Dataset to newcsv file named "zomatocleaned.csv"
    dataframe.to_csv('zomatocleaned.csv')
    return dataframe
# NOTE(review): remove_the_unknown_character() already writes
# 'zomatocleaned.csv' internally, so this line re-runs the whole pipeline
# and writes the same file a second time.
remove_the_unknown_character().to_csv('zomatocleaned.csv')
#check if mysql table is created using "zomatocleaned.csv"
#Use this final dataset and upload it on the provided database for performing analysis in MySQL
#To run this task, first run the application from the Terminal to create a table named 'Zomato' and then run the test.
def start():
    """Run the full data-cleaning pipeline (tasks 1-7) end to end."""
    remove_the_unknown_character()
def task_runner():
    """Entry point used by the test harness to kick off the pipeline."""
    start()
SQL FOR DATA ANALYSIS
import pandas as pd
import sqlite3
#Defining a function which takes SQL queries and prints the output
def SQL(dataframe_csv, query):
    """Run *query* against *dataframe_csv* loaded into a SQLite table.

    The dataframe is (re)written to the 'zomato_table' table of the
    'zomato.db' database on every call, so each query sees the data that
    was just passed in.

    Parameters
    ----------
    dataframe_csv : pandas.DataFrame
        Data to expose as the SQL table.
    query : str
        SQL statement to execute against 'zomato_table'.

    Returns
    -------
    pandas.DataFrame
        The query's result set.
    """
    df = dataframe_csv
    # create a connection to the database
    conn = sqlite3.connect('zomato.db')
    try:
        # convert the DataFrame to a SQLite database table.
        # The original also passed index_label=..., which is meaningless
        # (and ignored) when index=False, so it was dropped.
        df.to_sql('zomato_table', conn, if_exists='replace', index=False)
        result_df = pd.read_sql_query(query, conn)
    finally:
        # close the connection even when to_sql or the query raises,
        # so the database handle is never leaked.
        conn.close()
    return result_df
df = pd.read_csv('zomatocleaned.csv') # read the CSV file into a pandas DataFrame
# drop the unnamed index column that to_csv wrote as the first column
df=df.iloc[:,1:]
# preview: first 10 rows of the cleaned table
query="SELECT * FROM zomato_table LIMIT 10"
SQL(df,query)
name | online_order | book_table | rating | votes | location | rest_type | dish_liked | cuisines | approx_cost | type | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Jalsa | Yes | Yes | 4.1 | 775 | Banashankari | Casual Dining | Pasta, Lunch Buffet, Masala Papad, Paneer Laja... | North Indian, Mughlai, Chinese | 800 | Buffet |
1 | Spice Elephant | Yes | No | 4.1 | 787 | Banashankari | Casual Dining | Momos, Lunch Buffet, Chocolate Nirvana, Thai G... | Chinese, North Indian, Thai | 800 | Buffet |
2 | San Churro Cafe | Yes | No | 3.8 | 918 | Banashankari | Cafe, Casual Dining | Churros, Cannelloni, Minestrone Soup, Hot Choc... | Cafe, Mexican, Italian | 800 | Buffet |
3 | Addhuri Udupi Bhojana | No | No | 3.7 | 88 | Banashankari | Quick Bites | Masala Dosa | South Indian, North Indian | 300 | Buffet |
4 | Grand Village | No | No | 3.8 | 166 | Basavanagudi | Casual Dining | Panipuri, Gol Gappe | North Indian, Rajasthani | 600 | Buffet |
5 | Timepass Dinner | Yes | No | 3.8 | 286 | Basavanagudi | Casual Dining | Onion Rings, Pasta, Kadhai Paneer, Salads, Sal... | North Indian | 600 | Buffet |
6 | Rosewood International Hotel - Bar & Restaurant | No | No | 3.6 | 8 | Mysore Road | Casual Dining | None | North Indian, South Indian, Andhra, Chinese | 800 | Buffet |
7 | Onesta | Yes | Yes | 4.6 | 2556 | Banashankari | Casual Dining, Cafe | Farmhouse Pizza, Chocolate Banana, Virgin Moji... | Pizza, Cafe, Italian | 600 | Cafes |
8 | Penthouse Cafe | Yes | No | 4.0 | 324 | Banashankari | Cafe | Pizza, Mocktails, Coffee, Nachos, Salad, Pasta... | Cafe, Italian, Continental | 700 | None |
9 | Smacznego | Yes | No | 4.2 | 504 | Banashankari | Cafe | Waffles, Pasta, Coleslaw Sandwich, Choco Waffl... | Cafe, Mexican, Italian, Momos, Beverages | 550 | Cafes |
1. For a high-level overview of the hotels, provide us the top 5 most voted hotels in the delivery category.
# Q1: top 5 most voted hotels in the Delivery category
query="select name,votes,rating from zomato_table where type='Delivery' order by votes desc limit 5;"
SQL(df,query)
name | votes | rating | |
---|---|---|---|
0 | Empire Restaurant | 8286 | 4.1 |
1 | Church Street Social | 7544 | 4.3 |
2 | Meghana Foods | 7233 | 4.4 |
3 | Empire Restaurant | 6948 | 4.1 |
4 | Empire Restaurant | 4884 | 4.4 |
2. The rating of a hotel is a key identifier in determining a restaurant’s performance. Hence for a particular location called Banashankari find out the top 5 highly rated hotels in the delivery category.
#write your query
# Q2: top 5 highest-rated Delivery hotels in Banashankari
query="Select name, rating, location, type from zomato_table where location='Banashankari'and type='Delivery' order by rating desc limit 5;"
SQL(df,query)
name | rating | location | type | |
---|---|---|---|---|
0 | Onesta | 4.6 | Banashankari | Delivery |
1 | Onesta | 4.6 | Banashankari | Delivery |
2 | Corner House Ice Cream | 4.3 | Banashankari | Delivery |
3 | Stoned Monkey | 4.3 | Banashankari | Delivery |
4 | Ayodhya Upachar | 4.3 | Banashankari | Delivery |
3. Compare the ratings of the cheapest and most expensive hotels in Indiranagar.
# Q3: ratings of the cheapest vs. most expensive hotel in Indiranagar.
# NOTE(review): if approx_cost is still stored as text with thousands
# separators (e.g. '1,500', as seen in the exported CSV preview),
# ORDER BY approx_cost sorts lexicographically, not numerically —
# verify the column type before trusting this result.
query="select (select rating from zomato_table where location='Indiranagar' order by approx_cost limit 1) as rating1, (select rating from zomato_table where location='Indiranagar' order by approx_cost desc limit 1) as rating2 from zomato_table limit 1;"
SQL(df,query)
rating1 | rating2 | |
---|---|---|
0 | 4.3 | 4.1 |
4. Online ordering of food has exponentially increased over time. Compare the total votes of restaurants that provide online ordering services and those who don’t provide online ordering service.
#write your query
# Q4: total votes of restaurants with vs. without online ordering
query="SELECT online_order, SUM(votes) as total_votes FROM zomato_table GROUP BY online_order;"
SQL(df,query)
online_order | total_votes | |
---|---|---|
0 | No | 3922869 |
1 | Yes | 6796488 |
5. Number of votes defines how much the customers are involved with the service provided by the restaurants For each Restaurant type, find out the number of restaurants, total votes, and average rating. Display the data with the highest votes on the top( if the first row of output is NA display the remaining rows).
#write your query
# Q5: per restaurant type — restaurant count, total votes and average
# rating, highest total votes first, excluding the placeholder 'NA' type
query="select type,count(name),sum(votes),avg(rating) from zomato_table group by type having type!='NA' order by sum(votes) desc limit 7;"
SQL(df,query)
type | count(name) | sum(votes) | avg(rating) | |
---|---|---|---|---|
0 | Delivery | 15014 | 2739517 | 3.464533 |
1 | Dine-out | 11626 | 2625344 | 3.476512 |
2 | Drinks & nightlife | 580 | 447949 | 3.817133 |
3 | Buffet | 517 | 436637 | 3.877583 |
4 | Cafes | 1018 | 347871 | 3.643672 |
5 | Desserts | 2291 | 264266 | 3.586687 |
6 | Pubs and bars | 421 | 251245 | 3.758435 |
6. What is the most liked dish of the most-voted restaurant on Zomato? (As the restaurant has a tie-up with Zomato, the restaurant compulsorily provides online ordering and delivery facilities.)
#write your query
# Q6: most liked dishes of the most-voted restaurant on Zomato.
# "Most-voted" is defined by the highest vote count among restaurants
# that provide online ordering (per the task statement) — NOT by rating.
# The original query demanded rating = MAX(rating) AND votes = MAX(votes)
# simultaneously, which returns zero rows whenever the most-voted
# restaurant is not also the highest-rated one.
query="SELECT name, dish_liked, rating, votes FROM zomato_table WHERE online_order = 'Yes' ORDER BY votes DESC LIMIT 1;"
SQL(df,query)
name | dish_liked | rating | votes | |
---|---|---|---|---|
0 | Byg Brewski Brewing Company | Cocktails, Butter Chicken, Dahi Kebab, Rajma C... | 4.9 | 16832 |
7. To increase the maximum profit, Zomato is in need to expand its business. For doing so Zomato wants the list of the top 15 restaurants which have min 150 votes, have a rating greater than 3, and is currently not providing online ordering. Display the restaurants with highest votes on the top.
#write your query
# Q7: top 15 expansion candidates — at least 150 votes, rating above 3,
# currently no online ordering, most-voted first
query="SELECT name, rating, votes ,online_order from zomato_table WHERE online_order = 'No' AND rating > 3 AND votes >= 150 ORDER BY votes DESC LIMIT 15;"
SQL(df,query)
name | rating | votes | online_order | |
---|---|---|---|---|
0 | Toit | 4.7 | 14956 | No |
1 | Toit | 4.7 | 14956 | No |
2 | Truffles | 4.7 | 14726 | No |
3 | Truffles | 4.7 | 14723 | No |
4 | Truffles | 4.7 | 14717 | No |
5 | Truffles | 4.7 | 14710 | No |
6 | Truffles | 4.7 | 14704 | No |
7 | Truffles | 4.7 | 14694 | No |
8 | Truffles | 4.7 | 14690 | No |
9 | Truffles | 4.7 | 14654 | No |
10 | Truffles | 4.7 | 14654 | No |
11 | Truffles | 4.7 | 14654 | No |
12 | AB's - Absolute Barbecues | 4.8 | 12121 | No |
13 | The Black Pearl | 4.7 | 10550 | No |
14 | The Black Pearl | 4.7 | 10547 | No |
# display the full cleaned dataset
pd.read_csv('zomatocleaned.csv')
Unnamed: 0 | name | online_order | book_table | rating | votes | location | rest_type | dish_liked | cuisines | approx_cost | type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | Jalsa | Yes | Yes | 4.1 | 775 | Banashankari | Casual Dining | Pasta, Lunch Buffet, Masala Papad, Paneer Laja... | North Indian, Mughlai, Chinese | 800 | Buffet |
1 | 1 | Spice Elephant | Yes | No | 4.1 | 787 | Banashankari | Casual Dining | Momos, Lunch Buffet, Chocolate Nirvana, Thai G... | Chinese, North Indian, Thai | 800 | Buffet |
2 | 2 | San Churro Cafe | Yes | No | 3.8 | 918 | Banashankari | Cafe, Casual Dining | Churros, Cannelloni, Minestrone Soup, Hot Choc... | Cafe, Mexican, Italian | 800 | Buffet |
3 | 3 | Addhuri Udupi Bhojana | No | No | 3.7 | 88 | Banashankari | Quick Bites | Masala Dosa | South Indian, North Indian | 300 | Buffet |
4 | 4 | Grand Village | No | No | 3.8 | 166 | Basavanagudi | Casual Dining | Panipuri, Gol Gappe | North Indian, Rajasthani | 600 | Buffet |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
34048 | 56247 | Best Brews - Four Points by Sheraton Bengaluru... | No | No | 3.6 | 27 | Whitefield | Bar | NaN | Continental | 1,500 | Pubs and bars |
34049 | 56248 | Vinod Bar And Restaurant | No | No | NaN | 0 | Whitefield | Bar | NaN | Finger Food | 600 | Pubs and bars |
34050 | 56249 | Plunge - Sheraton Grand Bengaluru Whitefield H... | No | No | NaN | 0 | Whitefield | Bar | NaN | Finger Food | 2,000 | Pubs and bars |
34051 | 56250 | Chime - Sheraton Grand Bengaluru Whitefield Ho... | No | Yes | 4.3 | 236 | ITPL Main Road, Whitefield | Bar | Cocktails, Pizza, Buttermilk | Finger Food | 2,500 | Pubs and bars |
34052 | 56251 | The Nest - The Den Bengaluru | No | No | 3.4 | 13 | ITPL Main Road, Whitefield | Bar, Casual Dining | NaN | Finger Food, North Indian, Continental | 1,500 | Pubs and bars |
34053 rows × 12 columns
# build a one-column dataframe of the unique restaurant locations,
# used below as the input to geocoding
df=pd.read_csv('zomatocleaned.csv')
zomato=pd.DataFrame()
zomato['location']=df['location'].unique()
zomato
location | |
---|---|
0 | Banashankari |
1 | Basavanagudi |
2 | Mysore Road |
3 | Jayanagar |
4 | Kumaraswamy Layout |
... | ... |
88 | West Bangalore |
89 | Magadi Road |
90 | Yelahanka |
91 | Sahakara Nagar |
92 | Peenya |
93 rows × 1 columns
# import required libraries
from geopy.geocoders import Nominatim
import pandas as pd
from tqdm import tqdm
# create a geolocator object
# NOTE(review): Nominatim is a rate-limited public service — for bulk
# geocoding, consider wrapping calls with geopy's RateLimiter.
geolocator = Nominatim(user_agent="zomato_app")
# define a function to get the latitude and longitude of a location
def get_lat_long(location):
    """Return (latitude, longitude) for *location* within Bangalore, or
    (None, None) when geocoding fails or finds no match."""
    try:
        # use geolocator to get the latitude and longitude of the location
        geo_location = geolocator.geocode(location + ", Bangalore")
        return geo_location.latitude, geo_location.longitude
    except Exception:
        # Narrowed from a bare 'except:' which also swallows
        # KeyboardInterrupt/SystemExit. Exception still covers network
        # errors and the AttributeError raised when geocode returns None.
        return None, None
# geocode every unique location (one network request per row, hence tqdm)
latitude_list = []
longitude_list = []
for location in tqdm(zomato['location']):
    latitude, longitude = get_lat_long(location)
    latitude_list.append(latitude)
    longitude_list.append(longitude)
# attach the collected coordinates as new columns
zomato['latitude'] = latitude_list
zomato['longitude'] = longitude_list
# print the updated dataframe
zomato
100%|██████████████████████████████████████████████████████████████████████████████████| 93/93 [00:48<00:00, 1.91it/s]
location | latitude | longitude | |
---|---|---|---|
0 | Banashankari | 12.915221 | 77.573598 |
1 | Basavanagudi | 12.941726 | 77.575502 |
2 | Mysore Road | 12.946662 | 77.530090 |
3 | Jayanagar | 12.929273 | 77.582423 |
4 | Kumaraswamy Layout | 12.908149 | 77.555318 |
... | ... | ... | ... |
88 | West Bangalore | 12.984852 | 77.540063 |
89 | Magadi Road | 12.975608 | 77.555356 |
90 | Yelahanka | 13.100698 | 77.596345 |
91 | Sahakara Nagar | 13.062147 | 77.580061 |
92 | Peenya | 13.032942 | 77.527325 |
93 rows × 3 columns
# left-join the coordinates onto every restaurant row by location
df = df.merge(zomato[['location', 'longitude','latitude']], left_on='location', right_on='location', how='left')
# drop the first (exported index) column
df=df.iloc[:,1:]
df.to_csv('zomato1.csv',index=False)
# NOTE(review): duplicates are dropped only AFTER zomato1.csv is written;
# if the file should be duplicate-free, move this line above to_csv.
df.drop_duplicates(inplace=True)
Extracting some extra data from the dataframe and storing it in separate dataframes, one per attribute (liked dishes, restaurant types, cuisines).
# frequency table of individual liked dishes.
# NOTE(review): the split/explode runs over value_counts().index, i.e. the
# UNIQUE dish_liked strings — a dish combo shared by many restaurants is
# counted once, not per restaurant. Confirm whether per-restaurant
# weighting (exploding df['dish_liked'] directly) was intended.
x=pd.DataFrame()
x['dish_liked']=df['dish_liked'].value_counts().index
x['count']=df['dish_liked'].value_counts().values
dish_liked = x['dish_liked']
# Split the strings into lists
dish_liked = dish_liked.str.split(', ')
# Explode the lists into separate rows
dish_liked = dish_liked.explode()
# Count the occurrences of each item
counts = dish_liked.value_counts()
Dishes=pd.DataFrame()
Dishes["Dish"]=counts.index
Dishes['Count']=counts.values
Dishes
Dish | Count | |
---|---|---|
0 | Pasta | 681 |
1 | Burgers | 618 |
2 | Pizza | 567 |
3 | Cocktails | 531 |
4 | Biryani | 458 |
... | ... | ... |
2785 | Chicken Pepper Fry | 1 |
2786 | Patata Bravas | 1 |
2787 | Coffee Toffee | 1 |
2788 | Mala Chicken | 1 |
2789 | Papaya Juice | 1 |
2790 rows × 2 columns
# persist the dish frequency table (index column included)
Dishes.to_csv('dishes_liked.csv')
# frequency table of individual restaurant types
y = pd.DataFrame()
y['rest_type'] = df['rest_type'].value_counts().index
y['count'] = df['rest_type'].value_counts().values
rest_type = y['rest_type']
# Split the strings into lists
rest_type = rest_type.str.split(', ')
# Explode the lists into separate rows
rest_type = rest_type.explode()
# Count the occurrences of each item
counts = rest_type.value_counts()
Rest_type = pd.DataFrame()
Rest_type["Rest_type"] = counts.index
Rest_type['Count'] = counts.values
# BUG FIX: export the aggregated Rest_type frequency table, not the raw
# exploded 'rest_type' Series. The original wrote the un-aggregated
# Series (full of duplicate rows) and never used the Rest_type dataframe
# it had just built — inconsistent with the cuisines block below, which
# correctly exports its aggregated counts.
Rest_type.to_csv('rest_type.csv', index=False)
pd.read_csv('rest_type.csv')
rest_type | |
---|---|
0 | Quick Bites |
1 | Casual Dining |
2 | Cafe |
3 | Delivery |
4 | Dessert Parlor |
... | ... |
158 | Quick Bites |
159 | Dessert Parlor |
160 | Food Court |
161 | Sweet Shop |
162 | Dessert Parlor |
163 rows × 1 columns
# frequency table of individual cuisines across all restaurants
cuisine_freq = df['cuisines'].value_counts()
z = pd.DataFrame()
z['cuisines'] = cuisine_freq.index
z['count'] = cuisine_freq.values
# split the comma-separated strings and explode to one cuisine per row
exploded = z['cuisines'].str.split(', ').explode()
# count how often each individual cuisine occurs
counts = exploded.value_counts()
cuisines = pd.DataFrame()
cuisines["cuisines"] = counts.index
cuisines['Count'] = counts.values
cuisines.to_csv('cuisines.csv', index=False)
pd.read_csv('cuisines.csv')
cuisines | Count | |
---|---|---|
0 | North Indian | 1205 |
1 | Chinese | 1023 |
2 | Fast Food | 554 |
3 | Continental | 541 |
4 | South Indian | 495 |
... | ... | ... |
102 | Sindhi | 1 |
103 | Russian | 1 |
104 | Bohri | 1 |
105 | Cantonese | 1 |
106 | Malwani | 1 |
107 rows × 2 columns