User Types	Values
Vegetarian	12812
Flexitarian	26522
Carnivore	107134

6 Code

# Build Data
def prepare_data():
    ''' this function loads and merges the instacart data
    that is split across a number of different csv files.
    The codebook can found here:
    https://gist.github.com/jeremystan/c3b39d947d9b88b3ccff3147dbcf6c6b '''

    # load necessary CSVs
    order_products_train = pd.read_csv('order_products__train.csv') # contains order/product info
    order_products_prior = pd.read_csv('order_products__prior.csv') # contains order/product info
    orders = pd.read_csv('orders.csv') # contains variables about orders
    products = pd.read_csv('products.csv') # contains variables about range of products
    departments = pd.read_csv('departments.csv') # categorizes products by department
    aisles = pd.read_csv('aisles.csv') # categorizes products by aisle

    # combine orders_product_train/prior into single dataframe
    order_products = pd.concat([order_products_train, order_products_prior])

    # merge in all other datasets
    df = orders.merge(order_products, on = ['order_id'])
    df = df.merge(products, on = ['product_id'])
    df = df.merge(departments, on = ['department_id'])
    df = df.merge(aisles, on = ['aisle_id'])

    return df # return df for analysis

df = prepare_data() # instantiate df; will consume ~4gb of RAM

# Find most ordered products, by aisle; filter out non-food products

# List aisle_ids:
# 6 - other; 10 - kitchen supplies; 11 - cold flu allergy; 20 - oral hygiene
# 22 - hair care; 25 - soap; 40 - dog food care; 41 - cat food care;
# 44 - eye ear care; 46 - mint gum; 54 - paper goods; 55 - shave needs
# 56 - diapers wipes; 60 - trash bag liners; 70 - digestion; 73 - facial care
# 74 - dish detergents, 75 - laundry
# 80 - deodorants; 82 - baby accessories; 85 - food storage; 87 - more households
# 92 - baby food formula; 97 - baking supplies decor; 100 - missing;
# 101 - air fresheners candles; 102 - baby bath body care; 109 - skin care;
# 111 - plates bowls cups flatware; 114 - cleaning products; 118 - first aid
# 126 - feminine care; 127 - body lotions soap; 132 - beauty; 133 - muscles joint pain relief

# Create a aisle_id filter list
filter_list = [6, 10, 11, 20, 22, 25, 40, 41, 44, 46, 54, 55, 56, 60, 70, 73,
74, 75, 80, 82, 85, 87, 92, 97, 100, 101, 102, 109, 111, 114, 118, 126,
127, 132, 133]

# Filter the df - return rows that are not in the filter list
filtered_df = df.loc[~df['aisle_id'].apply(lambda x: x in filter_list)]

# Get the top 10 products to collect nutrient statistics on
top_products_by_aisle = filtered_df.groupby(['aisle_id', 'aisle'])['product_name'].value_counts()
top_products_by_aisle = top_products_by_aisle.reset_index(name="count")

# Get the top 10 by using head
top_products_by_aisle = top_products_by_aisle.groupby('aisle_id').head(10)

# Search each entry in top_products_by_aisle by USDA database through API
def fdcID_retrieval():
    ''' This function finds the FDCID for each of the top 10 most
    commonly ordered items by aisle. It does so by capturing the first record
    from the USDA database, FoodData Central (https://fdc.nal.usda.gov/).
    The first record is the the most likely object based on USDA's search
    algorithm. This function could be improved by making another function
    which is executed prior to this function that that pulls the top
    10 or so results and then does fuzzy matching to find the most likely
    product item given top_products_by_aisle. This would help us be more certain
    that the product that I am collecting nutritional data is a lot like the
    one found in the Instacart aisles.

    I am retrieving the FDCID because it is necessary to move to the next
    step which is: pull nutrition data. '''

    # Set API details
    requested_url = 'https://api.nal.usda.gov/fdc/v1/search'
    api_key = '?api_key=dvCyz1caFZ12A2Q04pm7ZQ9b9Z8h4pcK7dl4GI8K'
    headers = {'Content-Type': 'application/json'}

    # Put top products in a list
    top_products_by_aisle_list = top_products_by_aisle['product_name'].tolist()

    fdcID_container = [] # container for results
    for item in top_products_by_aisle_list:
        data = {"generalSearchInput": item} # pull item in list
        data_str = json.dumps(data).encode("utf-8") # convert to json format
        response = requests.post(requested_url + api_key,
        headers=headers, data=data_str) # commit and API request for the item
        parsed = json.loads(response.content) # generate JSON data
        try: # using try for case where one item which has no results
            temp_fdcID = parsed['foods'][0]['fdcId'] # pull first search result
            fdcID_container.append(temp_fdcID) # append the FDCID for use later
        except: # in case of error/no result, append np.nan
            fdcID_container.append(np.nan)

    return fdcID_container

fdcID_list = fdcID_retrieval()

# Using the FDCID data pulled above, conduct another API request to get nutrition
# Nutrient list found on U.S. food product labels
# 1257 - trans fat
# 1293 - trans fat - poly
# 1292 - trans fat - mono
# 1258 - sat. fat
# 1253 - cholesterol
# 1093 - sodium
# 1005 - carbohydrates
# 1079 - fiber
# 2000 - sugars
# 1003 - protein
# 1104 - vit a
# 1162 - vit c
# 1087 - calcium
# 1089 - iron
# 1008 - energy

# Search each FDCID in the USDA database through API for nutritional data
def nutrition_retrieval():
    ''' This function collects the most important nutritional data for each
    of the top 10 most commonly ordered products. It does so by making calls
    to the USDA database, FoodData Central (https://fdc.nal.usda.gov/), and it
    then retrieves the returned JSON data for the relevant nutritional data. '''

    # Set container storage and ordering
    nutrient_container = []
    nutrient_list = ['trans_fat', 'sat_fat', 'cholesterol', 'sodium',
    'carbs', 'fiber','sugars', 'protein', 'vit_a', 'vit_c', 'calcium',
    'iron', 'fdcID']
    nutrient_container.append(nutrient_list)

    # Set API details
    USDA_URL = 'https://api.nal.usda.gov/fdc/v1/'
    API_KEY = 'api_key=dvCyz1caFZ12A2Q04pm7ZQ9b9Z8h4pcK7dl4GI8K'
    headers = {'Content-Type': 'application/json'}

    # Loop over each FDCID; commit a API request for each
    for i in range(0, len(fdcID_list)):
        fdcId = str(fdcID_list[i])
        requested_url = USDA_URL + fdcId + '?' + API_KEY
        response = requests.get(requested_url, headers=headers)
        parsed = json.loads(response.content)
        trans_fat = 0
        trans_fat_poly = 0
        trans_fat_mono = 0
        sat_fat = 0
        cholesterol = 0
        sodium = 0
        carbs = 0
        fiber = 0
        sugars = 0
        protein = 0
        vit_a = 0
        vit_c = 0
        calcium = 0
        iron = 0
        energy = 0
        fdc_id = fdcID_list[i]
        # Loop over dictionary length to look for desired data
        for j in range(0, 120):
            try:
                if parsed['foodNutrients'][j]['nutrient']['id'] == 1257:
                    trans_fat = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 1293:
                    trans_fat_poly = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 1292:
                    trans_fat_mono = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 1258:
                    sat_fat = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 1253:
                    cholesterol = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 1093:
                    sodium = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 1005:
                    carbs = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 1079:
                    fiber = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 2000:
                    sugars = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 1003:
                    protein = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 1104:
                    vit_a = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 1162:
                    vit_c = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 1087:
                    calcium = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 1089:
                    iron = parsed['foodNutrients'][j]['amount']

                if parsed['foodNutrients'][j]['nutrient']['id'] == 1008:
                    energy = parsed['foodNutrients'][j]['amount']

            except: # In case of error; continue anyways
                    pass

        trans_fat = trans_fat + trans_fat_poly + trans_fat_mono

        nutrient_container.append([trans_fat, sat_fat, cholesterol,
        sodium, carbs, fiber, sugars, protein, vit_a, vit_c,
        calcium, iron, energy, fdc_id])

    return nutrient_container

nutrient_list = nutrition_retrieval()

# Turn nutrient_list into df for preprocessing
nutrient_df = pd.DataFrame(nutrient_list[1::], columns = ['trans_fat', 'sat_fat', 'cholesterol',
                                            'sodium', 'carbs', 'fiber', 'sugars',
                                            'protein', 'vit_a', 'vit_c', 'calcium',
                                            'iron', 'energy', 'fdcID']) # sliced to remove column titles

# Preprocess the nutrient data
def nutrient_preprocessing(dataframe):
    ''' This function preprocesses the nutrient data by converting each
    nutrient that I will use in analyze to a common base of 1 kcal. This
    common base is helpful for my next step which is to create a
    weighted nutrient densite score (WNDS). Since the WNDS algorithm bases its
    calculations on 100kcal, this makes multiplication
    more convienient and easier to think about. '''

    # Convert nutrients to base 1 energy
    dataframe['protein'] = dataframe['protein'] / dataframe['energy']
    dataframe['fiber'] = dataframe['fiber'] / dataframe['energy']
    dataframe['trans_fat'] = dataframe['trans_fat'] / dataframe['energy']
    dataframe['sat_fat'] = dataframe['sat_fat'] / dataframe['energy']
    dataframe['sugars'] = dataframe['sugars'] / dataframe['energy']

    dataframe['calcium'] = dataframe['calcium'] / dataframe['energy']
    dataframe['vit_c'] = dataframe['vit_c'] / dataframe['energy']
    dataframe['sodium'] = dataframe['sodium'] / dataframe['energy']

    return dataframe

nutrient_df = nutrient_preprocessing(nutrient_df)

def weighted_nutrient_density_score(dataframe):
    ''' This function calculates the WNDS which is based on an algorithm
    devised by Joanne, Fulgoni, Hersey and Muth. This algorithm is based on
    a statistical analysis of the USDA Healthy Eating Index to determine
    which nutrients explain the most variation in the components scores.
    Those nutrients are: protein, fiber, calcium, trans fat, vitamin c,
    saturated fat, sugars, and sodium. '''

    # Calculate WNDS based on journal article
    wnds_protein = (1.4 * ((dataframe['protein'] * 100) / 50))
    wnds_fiber = (3.13 * ((dataframe['fiber'] * 100) / 25))
    wnds_calcium = (1 * ((dataframe['calcium'] * 100) / 1000))
    wnds_trans_fat = (2.51 * ((dataframe['trans_fat'] * 100) / 44))
    wnds_vit_c = (0.37 * ((dataframe['vit_c'] * 100) / 60))
    wnds_sat_fat = (2.95 * ((dataframe['sat_fat'] * 100) / 20))
    wnds_sugars = (0.52 * ((dataframe['sugars'] * 100) / 50))
    wnds_sodium = (1.34 * ((dataframe['sodium'] * 100) / 2400))

    wnds = (wnds_protein + wnds_fiber + wnds_calcium + wnds_trans_fat +
            wnds_vit_c - wnds_sat_fat - wnds_sugars - wnds_sodium) * 100

    return wnds

wnds = weighted_nutrient_density_score(nutrient_df)

# Place the analyzed data into a new dataframe
matching_df = pd.DataFrame(list(zip(wnds, fdcID_list,
top_products_by_aisle['aisle_id'], top_products_by_aisle['product_name'])),
                           columns = ['wnds_item_score', 'fdcID', 'aisle_id', 'product_name'])

# Apply WNDS score to all foods in the aisle; top 10 most ordered is proxy score for all items in the aisle
matching_df['wnds_aisle_mean'] = matching_df.groupby('aisle_id')['wnds_item_score'].transform('mean')

# Get aisle WNDS score by row
matching_df = matching_df.groupby('aisle_id').agg({'wnds_aisle_mean' : 'max'}).reset_index()

# Join aisle WNDS scores back to main dataframe (filtered DF)
df2 = filtered_df.merge(matching_df, on = ['aisle_id'])

# Build a WNDS score for each user_id by taking the average of their items
df2['userid_wnds_score'] = df2.groupby('user_id')['wnds_aisle_mean'].transform('mean')

# How healthy is the average instacart user?
user_health_aggregate = df2.groupby('user_id')['userid_wnds_score'].agg('max').reset_index()
user_health_aggregate['userid_wnds_score'].mean() # mean equal to median roughly
user_health_aggregate['userid_wnds_score'].median()

# Plot
N = user_health_aggregate['user_id'].shape[0]
trace = go.Scattergl( # scattergl for higher performance
        x = user_health_aggregate['user_id'],
        y = user_health_aggregate['userid_wnds_score'],
        hoverinfo = 'text',
        name = '',
        hovertemplate =
    'User ID: %{x}' +
    '<br>WNDS Score: %{y:.2f}',
        mode = 'markers',
        marker = dict(
        color = np.random.randn(N),
        colorscale = 'Viridis',
        line_width = 0.1))

layout = go.Layout(title="<b>Figure 5: WNDS for each Instacart User </b>",
hovermode = 'closest',
font = dict(size = 18),
xaxis = dict(title_text = 'User ID'),
yaxis = dict(title_text = 'WNDS'))

fig = dict(data = [trace], layout = layout)
plotly.offline.plot(fig) # offline plotting

# How healthy is each aisle?
aisle_mean = df2.groupby('aisle')['wnds_aisle_mean'].agg('max').reset_index()
aisle_mean['wnds_aisle_mean'].median()
aisle_mean['wnds_aisle_mean'].max()
aisle_mean['wnds_aisle_mean'].min()

# filter out NaN from aisle_mean
aisle_mean = aisle_mean.loc[aisle_mean['wnds_aisle_mean'].notnull()]

# Plot
random_subset = aisle_mean.sample(n=25)
N = random_subset['aisle'].shape[0]
random_subset = random_subset.sort_values(by = 'wnds_aisle_mean')

trace = go.Scattergl( # scattergl for higher performance
        x = random_subset['aisle'].str.title(), # title case the aisles
        y = random_subset['wnds_aisle_mean'],
        hoverinfo = 'text',
        name = '',
        hovertemplate =
    'Aisle: %{x}' +
    '<br>WNDS Score: %{y:.2f}',
        mode = 'markers',
        marker = dict(
        color = np.random.randn(N),
        colorscale = 'Viridis',
        line_width = 0.1))

layout = go.Layout(title="<b>Figure 4: WNDS for each Instacart Aisle </b>",
hovermode = 'closest',
font = dict(size = 18),
xaxis = dict(tickangle = 45,
title_text = 'Aisle'),
yaxis = dict(title_text = 'WNDS'),
margin = dict(b=150))

fig = dict(data = [trace], layout = layout)
plotly.offline.plot(fig) # offline plotting

# How healthy are plant-based consumers vs meat-based consumers?
# if running script live -- ignore this load and replace h3 with df2 from above
#h3 = pd.read_csv('C:\\Users\\Andrew\\Desktop\\h3_dataset.csv')
emp =  pd.read_csv('flex-emp.csv') # contains order/product info
h3 = df2.merge(emp, on = ['product_id'])

h3 = h3[(h3.aisle_id==96) | (h3.aisle_id==14) | (h3.aisle_id==106) |
             (h3.aisle_id==122) | (h3.aisle_id==7) | (h3.aisle_id==49) |
             (h3.aisle_id==35) | (h3.aisle_id==34) | (h3.aisle_id==42)] # filter specified foods only

h3 = h3[h3.emp != 'E'] # remove excluded items

def h3_comparison(dataframe):
    counts = dataframe.groupby(['user_id', 'emp']).size().reset_index()
    counts = counts.rename(columns={0: "item_count"})

    # get total counts of M + P
    total_counts = counts.groupby('user_id').sum().reset_index()
    total_counts = total_counts.rename(columns={'item_count': "total_count"})

    # build meat_count var
    meat_count = counts[counts.emp == 'M'].groupby('user_id')['item_count'].max().reset_index()
    meat_count = meat_count.rename(columns={'item_count': "meat_count"})

    # build plant_count var
    plant_count = counts[counts.emp == 'P'].groupby('user_id')['item_count'].max().reset_index()
    plant_count = plant_count.rename(columns={'item_count': "plant_count"})

    # join counts with original df
    dataframe = dataframe.merge(total_counts, how = 'outer', on = ['user_id'])
    dataframe = dataframe.merge(meat_count, how = 'outer', on = ['user_id'])
    dataframe = dataframe.merge(plant_count, how = 'outer', on = ['user_id'])

    # fill missing data with 0
    dataframe['meat_count'] = dataframe['meat_count'].fillna(0)
    dataframe['plant_count'] = dataframe['plant_count'].fillna(0)

    # get ratio -- base is meat
    ratio = dataframe.groupby('user_id').apply(lambda x: x['meat_count'] / x['total_count']).reset_index()
    ratio = ratio.rename(columns={0: "ratio"})

    # drop unnecessary column
    ratio = ratio.drop(columns=['level_1'])

    # add ratio to df
    dataframe = dataframe.merge(ratio, on = ['user_id'])

    # aggregate
    summarized_stats = dataframe.groupby('user_id').agg({'ratio' : 'max',
    'userid_wnds_score':'max'}).reset_index()

    return summarized_stats

ratios = h3_comparison(h3)

# Create empty column
ratios['categories'] = None

# Find user types:
carnivore = ratios['ratio'] == 1.00 # only buys meat
flexitarian = (ratios['ratio'] > 0) & (ratios['ratio'] < 1.00) # buys a mix
vegetarian = (ratios['ratio'] == 0.00) # only buys plants

# Apply mask
ratios.loc[carnivore, 'categories'] = 'Carnivore'
ratios.loc[flexitarian, 'categories'] = 'Flexitarian'
ratios.loc[vegetarian, 'categories'] = 'Vegetarian'

# Check distribution
ratios['categories'].value_counts()

# Subset data
carnivores = ratios[ratios.categories == 'Carnivore']
flexitarians = ratios[ratios.categories == 'Flexitarian']
vegetarians = ratios[ratios.categories == 'Vegetarian']

# Generate sample means
carnivores_mean = carnivores['userid_wnds_score'].mean()
flexitarians_mean = flexitarians['userid_wnds_score'].mean()
vegetarians_mean = vegetarians['userid_wnds_score'].mean()
# Plant eaters slightly healthier

# Dot plot
y_axis = ['Carnivores', 'Flexitarians', 'Vegetarians', 'Average User']
user_type_mean = [carnivores_mean, flexitarians_mean, vegetarians_mean, 32.29]
df_dotplot = pd.DataFrame({'y' : y_axis, 'x' : user_type_mean})
df_dotplot = df_dotplot.sort_values(by = 'x')

trace = go.Scattergl(
    x = df_dotplot['x'],
    y = df_dotplot['y'],
    mode = 'markers',
    marker = dict(
        color = 'rgba(156, 165, 196, 0.95)',
        line_color = 'rgba(156, 165, 196, 1.0)',
        line_width = 1,
        symbol = 'circle',
        size = 16))

layout = go.Layout(
    title = "<b>Figure 6: Average Mean WNDS by Shopper Type (Higher is Better) </b>",
    font = dict(size = 18),
    yaxis = dict(
    title_text = 'Instacart User Type'),
    xaxis = dict(
        showgrid = False,
        showline = True,
        title_text = "Weighted Nutrient Density Score",
        linecolor = 'rgb(102, 102, 102)',
        tickfont_color = 'rgb(102, 102, 102)',
        showticklabels = True,
        dtick = 0.5,
        ticks = 'outside',
        tickcolor = 'rgb(102, 102, 102)',),
    margin = dict(l=140, r=40, b=50, t=80),
    legend = dict(
        font_size = 10,
        yanchor = 'middle',
        xanchor = 'right',
    ),
    width = 1200,
    height = 800,
    paper_bgcolor = 'white',
    plot_bgcolor = 'white',
    hovermode = 'closest')

fig = dict(data = [trace], layout = layout)
plotly.offline.plot(fig) # offline plotting

# USDA Sanity Checks
import random
choices = random.choices(fdcID_list, k = 5)
choices # [171520, 458412, 171925, 341511, 167551]
for i in choices:
    print(matching_df[matching_df.fdcID == i]['product_name'])

Arsenault, Joanne E., Victor L. Fulgoni III, James C. Hersey, and Mary K. Muth. “A novel approach to selecting and weighting nutrients for nutrient profiling of foods and diets.” Journal of the Academy of Nutrition and Dietetics 112, no. 12 (2012): 1968-1975.↩︎
The algorithm only considers the following nutrients: protein, fiber, trans fat, saturated fat, sugars, calcium, vitamin c, and sodium.↩︎

Nutritionalcart: Adding Nutritional Information to Instacart Groceries

Andrew Fogarty

01/13/2020

1 Introduction

2 Measurement

3 Descriptive Inference

4 Hypothesis Testing

5 Conclusion

6 Code

7 Sources