# 1. Creating Datasets From CSV Files
# ===================================
# Import module needed to read csv's
from csv import reader

# Read the Google Play and Apple App Store CSV exports into lists of rows.
# Context managers close both file handles once parsing is done (the
# original left them open for the life of the script), and an explicit
# utf-8 encoding is used because both files contain non-ASCII app names.
with open('googleplaystore.csv', encoding='utf-8') as opened_google_file:
    google_dataset = list(reader(opened_google_file))
with open('AppleStore.csv', encoding='utf-8') as opened_apple_file:
    apple_dataset = list(reader(opened_apple_file))

# Separate the header row from the rest of the data in each dataset.
google_header = google_dataset[0]
apple_header = apple_dataset[0]
google_dataset = google_dataset[1:]
apple_dataset = apple_dataset[1:]

# Check to see if both data sets are lists.
print("google_dataset type is a list of a", type(google_dataset))
print("apple_dataset type is a list of a", type(apple_dataset),'\n\n')
# Helper (adapted from the Dataquest starter) used throughout the script
# to eyeball a slice of a dataset and, optionally, its dimensions.
def explore_data(dataset, start, end, rows_and_columns=True):
    """Print rows start..end-1 of dataset; optionally print row/col counts."""
    for current_row in dataset[start:end]:
        print(current_row)
        print('\n')  # adds a new (empty) line after each row
    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))
# For Google MarketPlace
print("Google MarketPlace")
print("==================")
# explore_data prints the rows itself and returns None, so this name is None
google_mplace_data = explore_data(google_dataset, 0, 3)
print("\n")
# print(google_mplace_data)
# For AppStore
print("App Store")
print("=========")
app_store_data = explore_data(apple_dataset, 0, 3)
print("\n\n")
# Google dataset column names
print("Google Columns\n==============\n", google_header,'\n\n')
# Apple dataset column names
print("Apple Columns\n=============\n", apple_header,'\n\n')
# Removes rows whose column count doesn't match the header (parse errors).
def del_rows_w_errs(dataset):
    """Delete malformed rows from dataset IN PLACE; return a summary tuple.

    The branch (google vs apple schema) is chosen by comparing the first
    row's width against each header. Bug fix: the original deleted rows
    from the list while iterating that same list, which silently skips the
    row immediately after every deletion; it also located rows with
    list.index(), which can match the wrong duplicate. Here we scan a
    snapshot and delete by the current (offset-corrected) index.
    """
    if len(dataset[0]) == len(google_header):
        expected = len(google_header)
        label = "For google_dataset, there was a total of"
    else:
        expected = len(apple_header)
        label = "For apple_dataset, there was a total of"
    count = 0
    # Iterate a snapshot so deletions cannot skip the following row.
    for i, row in enumerate(list(dataset)):
        if len(row) != expected:
            print(row)
            bad_row_num = i - count  # current position after earlier deletions
            print("Row number ", bad_row_num, "had an error and is deleted")
            del dataset[bad_row_num]
            count += 1
    return label, count, "errors"
# Run the cleanup on both datasets and report the error totals.
print(del_rows_w_errs(google_dataset))
print(del_rows_w_errs(apple_dataset))
# NOTE(review): these are aliases, not copies — both names refer to the
# same list objects, which is why the equality checks below print True.
google_ds_no_errs = google_dataset
apple_ds_no_errs = apple_dataset
print(len(google_ds_no_errs))
print(len(apple_ds_no_errs))
print('\n')
print(google_ds_no_errs == google_dataset)
print(apple_ds_no_errs == apple_dataset)
print("Working google dataset = 'google_ds_no_errs'")
print("Working apple dataset = 'apple_ds_no_errs'")
# Separating duplicates from the data: dedup() fills these four
# module-level lists with app NAMES (not full rows).
duplicate_google_apps = []
unique_google_apps = []
duplicate_apple_apps = []
unique_apple_apps = []
def dedup(dataset):
    """Sort app names into the matching unique/duplicate module-level lists.

    The dataset schema is detected by column count; the first sighting of a
    name goes to the unique list, every repeat to the duplicate list.
    """
    if len(dataset[0]) == len(google_header):
        for record in google_ds_no_errs:
            app_name = record[0]
            bucket = (duplicate_google_apps if app_name in unique_google_apps
                      else unique_google_apps)
            bucket.append(app_name)
    else:
        # Same split for the Apple data (name lives in column 1).
        for record in apple_ds_no_errs:
            app_name = record[1]
            bucket = (duplicate_apple_apps if app_name in unique_apple_apps
                      else unique_apple_apps)
            bucket.append(app_name)
dedup(google_ds_no_errs)
dedup(apple_ds_no_errs)
# Aliases for the unique-name lists built by dedup() above.
google_ds_no_errs_ddup = unique_google_apps
apple_ds_no_errs_ddup = unique_apple_apps
print("\nGoogle Marketplace")
print("==================")
print("Duplicate Apps -", len(duplicate_google_apps))
print("Unique Apps -",len(google_ds_no_errs_ddup))
print("Total Apps -", len(google_ds_no_errs),"\n")
print("\nAppStore")
print("========")
print("Duplicate Apps -", len(duplicate_apple_apps))
print("Unique Apps -",len(apple_ds_no_errs_ddup))
print("Total Apps -", len(apple_ds_no_errs),"\n")
# ddup_dict() fills these: full deduplicated rows plus the names already kept.
new_google_ds_no_errs_ddup = []
google_dups = []
new_apple_ds_no_errs_ddup = []
apple_dups = []
### Step 1 - create dict with unique keys ####
def ddup_dict(dataset):
    """Deduplicate app rows, keeping the single row with the MOST reviews.

    Step 1 builds a name -> highest-review-count dict; step 2 copies the
    row matching that count into the new_*_ds_no_errs_ddup module-level
    list (the *_dups list guards against several rows sharing the maximum).
    Bug fix: the original comparison was '>', which replaced the stored
    count with any SMALLER one — i.e. it kept the lowest-review row.
    Both branches now use '<' so the maximum is kept, as the name
    reviews_max intends.
    """
    if len(dataset[0]) == len(google_header):
        reviews_max = {}
        for app in google_ds_no_errs:
            name = app[0]
            n_reviews = float(app[3])
            if name in reviews_max and reviews_max[name] < n_reviews:
                reviews_max[name] = n_reviews
            elif name not in reviews_max:
                reviews_max[name] = n_reviews
        print('Found', len(google_ds_no_errs) - len(reviews_max), 'duplicate/s')
        # Inspect the dictionary to make sure everything went as expected.
        # Measure length of the dictionary — expected length = 9,659
        print("length of reviews_max dict for google dataset is", len(reviews_max))
        ### Step 2 - use dict to create new dataset with unique entries ####
        # Keep exactly one row per name: the one carrying the max review count.
        for app in google_ds_no_errs:
            name = app[0]
            n_reviews = float(app[3])
            if n_reviews == reviews_max[name] and name not in google_dups:
                new_google_ds_no_errs_ddup.append(app)
                google_dups.append(name)
    elif len(dataset[0]) == len(apple_header):
        reviews_max = {}
        for app in apple_ds_no_errs:
            name = app[1]
            n_reviews = float(app[5])
            if name in reviews_max and reviews_max[name] < n_reviews:
                reviews_max[name] = n_reviews
            elif name not in reviews_max:
                reviews_max[name] = n_reviews
        print('Found', len(apple_ds_no_errs) - len(reviews_max), 'duplicate/s')
        print("length of reviews_max dict for apple dataset is", len(reviews_max))
        for app in apple_ds_no_errs:
            name = app[1]
            n_reviews = float(app[5])
            if n_reviews == reviews_max[name] and name not in apple_dups:
                new_apple_ds_no_errs_ddup.append(app)
                apple_dups.append(name)
# Deduplicate both datasets and sanity-check the resulting row counts.
ddup_dict(google_ds_no_errs)
ddup_dict(apple_ds_no_errs)
print("length of 'new_google_ds_no_errs_ddup' is",len(new_google_ds_no_errs_ddup))
print("length of 'new_apple_ds_no_errs_ddup' is",len(new_apple_ds_no_errs_ddup))
explore_data(new_google_ds_no_errs_ddup, 0, 3, True)
explore_data(new_apple_ds_no_errs_ddup, 0, 3, True)
print("Working google dataset = 'new_google_ds_no_errs_ddup'")
print("Working apple dataset = 'new_apple_ds_no_errs_ddup'")
# Strict classifier: returns False as soon as any character falls outside
# the common English (ASCII) set, i.e. has a code point above 127;
# otherwise returns True.
def english_or_not(string):
    """Return True iff every character in string is ASCII (ord <= 127)."""
    return all(ord(char) <= 127 for char in string)
# Spot-check the strict ASCII-only classifier (returns True/False).
print(english_or_not('Instagram'))
print(english_or_not('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(english_or_not('Docs To Go™ Free Office Suite'))
print(english_or_not('Instachat 😜'))
# More forgiving classifier: a name counts as english unless MORE THAN
# three of its characters fall outside the ASCII range, so names with a
# stray emoji or trademark sign are not disqualified. This is a better
# way than the strict version as it's based on a larger sampling.
def english_or_not(string):
    """Return 'english' unless string has more than 3 non-ASCII characters."""
    non_ascii = sum(1 for character in string if ord(character) > 127)
    return "not english" if non_ascii > 3 else "english"
# Spot-check the tolerant classifier (returns 'english'/'not english').
print(english_or_not('Docs To Go™ Free Office Suite'))
print(english_or_not('Instachat 😜'))
print(english_or_not('爱奇艺PPS -《欢乐颂2》电视剧热播'))
# Using the function above, eng_only() segregates english and non-english
# apps for either dataset into these separate module-level lists.
google_ds_no_errs_ddup_eng = []
google_non_eng_apps = []
apple_ds_no_errs_ddup_eng = []
apple_non_eng_apps = []
def eng_only(dataset):
    """Route each row into the english or non-english global list for its store.

    The schema (google name in column 0, apple name in column 1) is chosen
    by comparing the first row's width with each header; rows are classified
    with english_or_not() and a per-store count of rejects is printed.
    """
    width = len(dataset[0])
    if width == len(google_header):
        for app in dataset:
            target = (google_ds_no_errs_ddup_eng
                      if english_or_not(app[0]) == 'english'
                      else google_non_eng_apps)
            target.append(app)
        print("There were", len(google_non_eng_apps), "non english google apps")
    elif width == len(apple_header):
        for app in dataset:
            target = (apple_ds_no_errs_ddup_eng
                      if english_or_not(app[1]) == 'english'
                      else apple_non_eng_apps)
            target.append(app)
        print("There were", len(apple_non_eng_apps), "non english apple apps")
# Filter both deduplicated datasets down to english-named apps.
eng_only(new_google_ds_no_errs_ddup)
eng_only(new_apple_ds_no_errs_ddup)
print("length of 'google_ds_no_errs_ddup_eng' is",len(google_ds_no_errs_ddup_eng))
print("length of 'apple_ds_no_errs_ddup_eng' is",len(apple_ds_no_errs_ddup_eng))
explore_data(google_ds_no_errs_ddup_eng, 0, 3, True)
explore_data(apple_ds_no_errs_ddup_eng, 0, 3, True)
print("Working google dataset = 'google_ds_no_errs_ddup_eng'")
print("Working apple dataset = 'apple_ds_no_errs_ddup_eng'")
print(google_header,'\n')
print(apple_header,'\n')
print('Using the header from the original datasets, we can see that the price column is at:\n')
print('index 7 for Google Marketplace\n and\nindex 4 for App Store')
# free_apps() below copies the zero-price rows into these final datasets.
google_ds_final = []
apple_ds_final = []
def free_apps(dataset):
    """Copy only the zero-price rows into the matching *_ds_final global list."""
    width = len(dataset[0])
    if width == len(google_header):
        # Google stores price in column 7 as a string like '0'.
        google_ds_final.extend(app for app in dataset if app[7] == '0')
    elif width == len(apple_header):
        # Apple stores price in column 4 as a string like '0.0'.
        apple_ds_final.extend(app for app in dataset if app[4] == '0.0')
# Keep only the free apps; these *_ds_final lists are the analysis datasets.
free_apps(google_ds_no_errs_ddup_eng)
free_apps(apple_ds_no_errs_ddup_eng)
print("google_ds_final =",len(google_ds_final),"apps")
print("apple_ds_final =",len(apple_ds_final),"apps")
print("Working google dataset = 'google_ds_final'")
print("Working apple dataset = 'apple_ds_final'")
# To begin the analysis, we need to identify which columns in our datasets
# would give us the best insight into the most common genres in each market.
# We can then use those columns to generate frequency tables:
#   Google - Category[1], Genres[9]
#   Apple  - prime_genre[11]
# Show the human-readable names of the chosen genre/category columns.
print("google - " + google_header[1]+"[1] and " + google_header[9]+"[9]")
print("apple - " + apple_header[11]+"[11]")
from pprint import pprint
# Makes sure the variable 'index' is an integer.
def check_int(integer):
    """Return integer coerced to int, prompting interactively until numeric.

    Accepts ints and digit-only strings directly. A non-numeric string
    triggers an input() prompt and the re-entered value is validated
    recursively. Bug fix: the original discarded the recursive call's
    result and returned the raw, unvalidated input string; the validated
    int is now propagated with `return`.
    """
    if type(integer) == str and integer.isdigit() == False:
        x = input("type a number please ")
        return check_int(x)  # propagate the validated value
    else:
        x = int(integer)
    return x
# Creates a frequency table for any column we choose from a dataset.
def freq_table(dataset,index):
    """Return {value: 'NN.NN%'} frequency table for column `index` of dataset.

    index may be an int or a digit string; it is normalised through
    check_int. Bug fix: the original guarded the counting loop with
    `if check_int(index) == index`, which is False for a digit string
    ('1' != 1), silently returning an empty table; we now assign the
    normalised index and always count.
    """
    index = check_int(index)
    frequency_table = {}
    total_num_of_apps = len(dataset)
    for row in dataset:
        value = row[index]
        frequency_table[value] = frequency_table.get(value, 0) + 1
    # Convert raw counts to percentage strings in place.
    for item in frequency_table:
        frequency_table[item] = format(frequency_table[item] / total_num_of_apps * 100, '.2f') + "%"
    return frequency_table
# Frequency table of Google categories (column 1), shown as percentages.
results = freq_table(google_ds_final,1)
print("google - Categories")
print("===================")
pprint(results)
print(len(results))
# Uses the frequency-table function above and prints the percentages
# sorted in descending order.
def display_table(dataset, index):
    """Print 'value : NN.NN%' lines, sorted descending by percentage string."""
    table = freq_table(dataset, index)
    pairs = [(pct, value) for value, pct in table.items()]
    for pct, value in sorted(pairs, reverse=True):
        print(value, ':', pct)
print("GOOGLE GENRE COLUMN [9]")
print("====================")
# display_table prints its output and returns None, so these names are None.
google_genre_results = (display_table(google_ds_final, 9))
print("\nGOOGLE CATEGORY COLUMN [1]")
print("=======================")
google_category_results = (display_table(google_ds_final, 1))
print("\nAPPLE PRIME GENRE COLUMN [11]")
print("=========================")
apple_genre_results = (display_table(apple_ds_final, 11))
# Redefinition of freq_table: same counting, but returns a sorted list of
# (value, ':', percentage) tuples instead of printing or returning a dict.
def freq_table(dataset, index):
    """Return [(value, ':', 'NN.NN%'), ...] for column `index` of dataset,
    sorted descending by the percentage STRING (lexicographic order,
    matching the original behaviour)."""
    counts = {}
    total = 0
    for row in dataset:
        total += 1
        key = row[index]
        counts[key] = counts.get(key, 0) + 1
    # Percentage strings keyed by value.
    percentages = {k: format((c / total) * 100, '.2f') + "%"
                   for k, c in counts.items()}
    # Sort (percentage, value) tuples descending, then reshape for display.
    ordered = sorted(((pct, k) for k, pct in percentages.items()), reverse=True)
    return [(k, ':', pct) for pct, k in ordered]
# Exercise the redefined freq_table; its returned list is discarded here.
freq_table(apple_ds_final,11)
print("google installs column [5]")
print("==========================")
google_installs_results = (display_table(google_ds_final, 5))
# Now I'll calculate the average number of installs per app genre for the
# google data set using a nested loop.
# cat_google is a list of (category, ':', 'NN.NN%') tuples from the
# redefined freq_table, so category[0] below extracts the category name.
cat_google = freq_table(google_ds_final, 1)
for category in cat_google:
    category = category[0]
    total = 0
    len_category = 0
    for app in google_ds_final:
        category_app = app[1]
        if category_app == category:
            num_installs = app[5]
            # Install counts look like '1,000,000+' — strip ',' and '+'
            # before converting to a number.
            num_installs = num_installs.replace(',', '')
            num_installs = num_installs.replace('+', '')
            total += float(num_installs)
            len_category += 1
    avg_n_installs = total / len_category
    print(category, ':', avg_n_installs)
# This concludes my first project in Dataquest. Based on the analysis above,
# it seems that 'Games' is the best direction for developing an app, showing
# the most potential for being profitable.