# IMPORTS ==============================================================
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

import src.style # A file which helps me style all the plots more consistently


# READ TREES CSV & PRINT BASIC FACTS ===================================
trees = pd.read_csv('data/trees.csv')
print("Latitude Range:", trees.latitude.min(), trees.latitude.max())
print("Longitude Range:", trees.longitude.min(), trees.longitude.max())
print("Duplicated tree_id:", trees.tree_id.value_counts().max()>2)
print("Number of trees:", len(trees), trees.status.value_counts().to_dict())
print("Number of unique species of tree:")
trees.head(1)

Latitude Range: 40.70178954 40.87305363
Longitude Range: -74.01836049 -73.91120169
Duplicated tree_id: False
Number of trees: 64229 {'Alive': 62427, 'Dead': 1802}
Number of unique species of tree:


# READ NTA SHP DATA & PRINT BASIC FACTS ================================
neighborhoods = gpd.read_file('data/nta.shp')
print("Number of Manhattan neighbourhoods:", neighborhoods[neighborhoods["boroname"]=="Manhattan"]["ntaname"].nunique())
print(
    "No trees in the following neighborhoods of Manhattan:",
    set(neighborhoods[neighborhoods["boroname"]=="Manhattan"]["ntaname"].unique()) - set(trees["nta_name"].unique())
)

neighborhoods['point'] = neighborhoods['geometry'].apply(lambda x: x.centroid.coords[0])
neighborhoods.head(1)

Number of Manhattan neighbourhoods: 29
No trees in the following neighborhoods of Manhattan: {'park-cemetery-etc-Manhattan'}


print(
    "Proportion of trees Alive/Dead:",
    (trees["status"].value_counts() / len(trees)).round(3).to_dict()
)

print(
    "Any missing values for living trees?", 
    trees[trees.status == "Alive"].isna().any().any()
)

print("The only dead tree where 'spc_common' is given:")
trees[(trees["status"]=="Dead") & (~trees["spc_common"].isna())]

Proportion of trees Alive/Dead: {'Alive': 0.972, 'Dead': 0.028}
Any missing values for living trees? False
The only dead tree where 'spc_common' is given:


print("Value proportions for some categorical columns of `trees`:")

props = pd.DataFrame()
props["Status=Any"] = trees.fillna("NAN").drop(columns=["spc_common", "tree_id", "tree_dbh", "postcode", "nta", "nta_name", "longitude", "latitude"]) \
    .apply(lambda s: (s.value_counts() / len(s)).round(3).to_dict(), axis=0)
props["Status=Dead"] = trees[trees.status == "Dead"].fillna("NAN").drop(columns=["spc_common", "tree_id", "tree_dbh", "postcode", "nta", "nta_name", "longitude", "latitude"]) \
    .apply(lambda s: (s.value_counts() / len(s)).round(3).to_dict(), axis=0)
    
pd.options.display.max_colwidth = None
props

Value proportions for some categorical columns of `trees`:

1.a: Most Common Tree Species In Manhattan ¶


# PLOT PIE CHART OF TREE COUNTS ========================================

def plot_tree_counts(data, num_labelled=8):
    counts = data["spc_common"].value_counts().to_frame("count") \
        .reset_index().rename(columns={"index": "spc_common"})
    counts["percent"] = counts["count"] / counts["count"].sum()
    counts["label"] = counts["spc_common"].str.title() + counts["percent"].apply(lambda x: f" ({x:.1%})")
    
    fig, ax = plt.subplots(1, 1)
    wedges, texts = ax.pie(counts["count"], wedgeprops=dict(width=0.5), startangle=-50)

    bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)
    kw = dict(arrowprops=dict(arrowstyle="-"),
            bbox=bbox_props, zorder=0, va="center")

    for i, p in enumerate(wedges[:num_labelled]):
        ang = (p.theta2 - p.theta1)/2. + p.theta1
        y = np.sin(np.deg2rad(ang))
        x = np.cos(np.deg2rad(ang))
        horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
        connectionstyle = "angle,angleA=0,angleB={}".format(ang)
        kw["arrowprops"].update({"connectionstyle": connectionstyle})
        ax.annotate(counts["label"].loc[i], xy=(x, y), xytext=(1.35*np.sign(x), 1.4*y),
                    horizontalalignment=horizontalalignment, **kw)

    return fig, ax

fig, ax = plot_tree_counts(trees)
fig.suptitle("Species of Tree in Manhattan (Status=Any)", y=1.05);
# fig.savefig("figs/species.png", facecolor="white", bbox_inches="tight");
plt.close()


# SETUP FOR FOLLOWING GEOPANDAS MAP PLOTS ==============================
# SOME GEOPANDAS PLOTTING KWARGS ---------------------------------------
annot_kwargs = dict(
    textcoords='axes fraction',
    arrowprops=dict(color='red', shrink=0.05),
    horizontalalignment='right', 
    verticalalignment='top', 
    xycoords='data'
)
missing_kwds = {
    "facecolor": "lightgrey",
    "edgecolor": "grey",
    "hatch": "///",
}

# A TWO PANEL LAYOUT I WILL USE REPEATEDLY -----------------------------
def dual_layout(suptitle, titles, figsize=(18, 20), y=0.82):
    fig, ax = plt.subplots(1, 2, figsize=(18, 20))
    fig.suptitle(suptitle, y=y, fontsize=16)
    for i, a in enumerate(ax):
        a.set_title(titles[i], fontsize=16)
        a.set_xlabel("longitude", fontsize=12)
        a.set_ylabel("latitude", fontsize=12)
    return fig, ax

# SOME OBJECTS TO ENABLE CUSTOM LEGENDS --------------------------------
import matplotlib.patches as mpatches
class AnyObject(object):
    pass

class AnyObjectHandler(object):
    def legend_artist(self, legend, orig_handle, fontsize, handlebox):
        x0, y0 = handlebox.xdescent, handlebox.ydescent
        width, height = handlebox.width, handlebox.height
        patch = mpatches.Rectangle([x0, y0], width, height, lw=1,
                                   transform=handlebox.get_transform(),
                                   **missing_kwds)
        handlebox.add_artist(patch)
        return patch


# PLOTTING NEIGHBORHOOD TREE COUNTS AND DENSITIES ======================
# PREP DATA ------------------------------------------------------------
d = pd.DataFrame()
d["Alive"] = trees[trees["status"]=="Alive"]["nta_name"].value_counts()
d["Dead"] = trees[trees["status"]=="Dead"]["nta_name"].value_counts()
d = d.reset_index()
d.columns = ["nta_name", "alive", "dead"]
d["total"] = d["alive"] + d["dead"]
d["survival"] = d["alive"] / d["total"]
d = d.append(pd.Series({"nta_name": 'park-cemetery-etc-Manhattan'}), ignore_index=True) #.fillna(0)
# print(d.head())

man = pd.merge(neighborhoods[neighborhoods["boroname"]=="Manhattan"], d, left_on="ntaname", right_on="nta_name")
assert man["boroname"].nunique() == 1
# print(man.head())

man["alive/area"] = man["alive"].divide(man["shape_area"])
man["alive_rank"] = man["alive"].rank(ascending=False)
man["alive/area_rank"] = man["alive/area"].rank(ascending=False)

# CREATE CANVAS --------------------------------------------------------
fig, ax = plt.subplots(1, 2, figsize=(19, 13))
fig.suptitle("Number & Density of Living Trees in Each Neighborhood", y=0.92, fontsize=16)
ax[0].set_title("Number of Living Trees", fontsize=16)
ax[1].set_title("Density of Living Trees", fontsize=16)
ax[0].set_xlabel("longitude", fontsize=14)
ax[0].set_ylabel("latitude", fontsize=14)
ax[1].set_xlabel("longitude", fontsize=14)
ax[1].set_ylabel("latitude", fontsize=14)

# PLOT MAP -------------------------------------------------------------
man.plot("alive", cmap="Greens", ax=ax[0], legend=True)
man.plot("alive/area", cmap="Greens", ax=ax[1], legend=True)
man[man.nta_name=="park-cemetery-etc-Manhattan"].plot(ax=ax[0], **missing_kwds)
man[man.nta_name=="park-cemetery-etc-Manhattan"].plot(ax=ax[1], **missing_kwds)

# ANNOTATIONS ----------------------------------------------------------
for i, row in man.iterrows():
    if row["nta_name"] != 'park-cemetery-etc-Manhattan':
        ax[0].annotate(int(row["alive_rank"]), xy=row["point"], color="white" if row["alive_rank"]==1 else "black")
        ax[1].annotate(int(row["alive/area_rank"]), xy=row["point"], color="white" if row["alive/area_rank"]==1 else "black")

# Annotate top neighbourhoods
man = man.sort_values("alive", ascending=False)
ax[0].annotate(man.iloc[0]["ntaname"], xy=man.iloc[0]["point"], xytext=(0.3, 0.6), **annot_kwargs)
ax[0].annotate(man.iloc[1]["ntaname"], xy=man.iloc[1]["point"], xytext=(0.95, 0.3), **annot_kwargs)
ax[0].annotate(man.iloc[2]["ntaname"], xy=man.iloc[2]["point"], xytext=(0.7, 0.15), **annot_kwargs)

man = man.sort_values("alive/area", ascending=False)
ax[1].annotate(man.iloc[0]["ntaname"], xy=man.iloc[0]["point"], xytext=(0.95, 0.3), **annot_kwargs)
ax[1].annotate(man.iloc[1]["ntaname"], xy=man.iloc[1]["point"], xytext=(0.5, 0.75), **annot_kwargs)
ax[1].annotate(man.iloc[2]["ntaname"], xy=man.iloc[2]["point"], xytext=(0.3, 0.6), **annot_kwargs)

man[["ntaname", "alive", "alive_rank", "alive/area", "alive/area_rank"]].to_csv("results/density.csv", index=False)

# LEGEND ---------------------------------------------------------------
ax[0].legend([AnyObject()], ['park-cemetery-etc-Manhattan'],
           handler_map={AnyObject: AnyObjectHandler()}, loc="upper left")
ax[1].legend([AnyObject()], ['park-cemetery-etc-Manhattan'],
           handler_map={AnyObject: AnyObjectHandler()}, loc="upper left")

# fig.savefig("figs/density.png", bbox_inches="tight", facecolor="white")
plt.close()


# SCATTERPLOT ON MAP SHOWING LIVING TREE LOCATIONS =====================
# CREATE CANVAS --------------------------------------------------------
fig, ax = dual_layout(
    suptitle="Manhattan's Neighborhoods and Tree Locations",
    titles=("Manhattan's Neighborhoods", "Manhattan's Tree Locations")
)

# PLOT MAP -------------------------------------------------------------
man[man.nta_name=="park-cemetery-etc-Manhattan"].plot(ax=ax[0], **missing_kwds)
man[man.nta_name=="park-cemetery-etc-Manhattan"].plot(ax=ax[1], **missing_kwds)
man.dropna().plot(ax=ax[0], cmap="Pastel1", edgecolor="black")
man.dropna().plot(ax=ax[1], cmap="Pastel1", edgecolor="black")

sns.scatterplot(data=trees, x="longitude", y="latitude", ax=ax[1],
                s=1, alpha=0.3,
                # hue="status",
                color="green"
                )

# ANNOTATIONS ----------------------------------------------------------
for i, row in man.dropna().iterrows():
    ax[0].annotate(row["ntacode"], xy=row["point"], 
        horizontalalignment="center", fontsize=10,
        verticalalignment="top")

# LEGEND ---------------------------------------------------------------
ax[0].legend([AnyObject()], ['park-cemetery-etc-Manhattan'],
           handler_map={AnyObject: AnyObjectHandler()}, loc="upper left")
ax[1].legend([AnyObject()], ['park-cemetery-etc-Manhattan'],
           handler_map={AnyObject: AnyObjectHandler()}, loc="upper left")

# fig.savefig("figs/trees.png", bbox_inches="tight", facecolor="white")
plt.close()


# RECOMMENDATIONS ======================================================
# FILTER TO LIVING TREES ONLY ------------------------------------------
df = trees[trees.status == "Alive"].copy()
print(f"Proportion of trees alive (usable): {len(df)/len(trees):.1%}")

# HEALTH CATEGORY ->> ORDINAL ------------------------------------------
health_scores = ["Poor", "Fair", "Good"]
df["health"] = df["health"].map(lambda h: health_scores.index(h))

# PER SPECIES AGGREGATES -----------------------------------------------
df = df.groupby("spc_common").agg({
        "spc_common": ["count"], 
        "tree_dbh": ["mean", "std"], 
        "health": ["mean", "std"],
        "nta": ["nunique"],
    })

# COMPUTE OVERALL_SCORE ------------------------------------------------
def min_max_norm(column):
    return (column - column.min()) / (column.max() - column.min())
df[("overall_score", "mean")] = (min_max_norm(df[("health", "mean")]) + min_max_norm(df[("tree_dbh", "mean")])) / 2

# EXCLUDE SMALL SAMPLES ------------------------------------------------
df2 = df[df[("spc_common", "count")] >= 100].copy()

# RANK BY OVERALL_SCORE ------------------------------------------------
df2[("overall_score", "rank")] = df2[("overall_score", "mean")].rank(ascending=False)
# l = df2.sort_values(by=("overall_score", "mean"), ascending=False).head(10).index.to_list()
# print("\n".join([f"{r+1}. {name.title()}" for r, name in enumerate(l)]))
df2.sort_values(by=("overall_score", "mean"), ascending=False).head(10)

Proportion of trees alive (usable): 97.2%


# PRE-PROCESSING FOR FEATURE IMPORTANCE ANALYSIS =======================

df = trees[trees.status == "Alive"].copy()
print(f"Proportion of trees alive (usable): {len(df)/len(trees):.1%}")

# Encode the health categories as ordinal data -------------------------
health_scores = ["Poor", "Fair", "Good"]
df["health"] = df["health"].map(lambda h: health_scores.index(h))

# Transform binary data into booleans ----------------------------------
features_bin = [
    'root_stone', 'root_grate', 'root_other', 'trunk_wire', 'trnk_light',
    'trnk_other', 'brch_light', 'brch_shoe', 'brch_other'
]
df[features_bin] = df[features_bin] == "Yes"
df["on_curb"] = df["curb_loc"] == "OnCurb"
df.drop(columns=["curb_loc"], inplace=True)
features_bin.append("on_curb")

# View the value proportions for the features --------------------------
df.drop(columns=["status"], inplace=True)
df.drop(columns=["spc_common", "tree_id", "tree_dbh", "postcode", "nta", "nta_name", "longitude", "latitude"]) \
    .apply(lambda s: (s.value_counts() / len(s)).round(3).to_dict(), axis=0)

Proportion of trees alive (usable): 97.2%

health        {2: 0.759, 1: 0.184, 0: 0.058}
root_stone       {False: 0.799, True: 0.201}
root_grate         {False: 0.96, True: 0.04}
root_other         {False: 0.92, True: 0.08}
trunk_wire       {False: 0.985, True: 0.015}
trnk_light       {False: 0.995, True: 0.005}
trnk_other       {False: 0.911, True: 0.089}
brch_light       {False: 0.986, True: 0.014}
brch_shoe        {False: 0.999, True: 0.001}
brch_other       {False: 0.895, True: 0.105}
on_curb          {True: 0.932, False: 0.068}
dtype: object


# DEFINE THE FEATURES AND REGRESSION TARGET ============================
features_cont = [
    'tree_dbh', 
]
features_cat = [
    'nta_name', 'spc_common', 
]
features = features_bin + features_cont + features_cat
target = "health" # categorical (ordinal)


# PAIRWISE CORRELATIONS BETWEEN FEATURES AND TARGET ====================
df1 = df[features_bin + features_cont + [target]]
cor = df1.corr()
# print(len(cor)) # 11 features + 1 target

cor.style.background_gradient(cmap='bwr', vmin=-1, vmax=1).set_precision(2)


# CREATE THE RANDOM FOREST MODEL =======================================
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

features1 = features_bin + features_cont
X = df[features1]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=123)

forest = RandomForestClassifier(random_state=0)
forest.feature_names_in_ = features1
forest.fit(X_train, y_train)
forest.score(X_test, y_test)

0.7535080412635355


# COMPUTE & PLOT RESULTS FROM TWO TYPES OF FEATURE IMPORTANCE ==========
from src.forest_importance import *

forest_importances, std = mdi(forest)
fig, ax = plot_mdi(forest_importances, std);
# fig.savefig("figs/feature_importance_mdi.png", facecolor="white", bbox_inches="tight")
# plt.close()

forest_importances, std = perm(forest, X_test, y_test, random_state=123)
fig, ax = plot_perm(forest_importances, std);
# fig.savefig("figs/feature_importance_perm.png", facecolor="white", bbox_inches="tight")
# plt.close()

Elapsed time to compute the importances: 0.014 seconds
Elapsed time to compute the importances: 13.037 seconds


# SCATTERPLOT ON MAP SHOWING DEAD TREE LOCATIONS =======================
# CREATE CANVAS --------------------------------------------------------
fig, ax = dual_layout(
    suptitle="Manhattan's Neighborhoods and Dead Tree Locations",
    titles=("Manhattan's Neighborhoods", "Manhattan's Dead Tree Locations")
)

# PLOT MAP -------------------------------------------------------------
man[man.nta_name=="park-cemetery-etc-Manhattan"].plot(ax=ax[0], **missing_kwds)
man[man.nta_name=="park-cemetery-etc-Manhattan"].plot(ax=ax[1], **missing_kwds)
man.dropna().plot(ax=ax[0], cmap="Pastel1", edgecolor="black")
man.dropna().plot(ax=ax[1], cmap="Pastel1", edgecolor="black")

sns.scatterplot(data=trees[trees.status=="Dead"], x="longitude", y="latitude", ax=ax[1],
                s=1, alpha=1,
                # hue="status",
                color="red"
                )

# ANNOTATIONS ----------------------------------------------------------
for i, row in man.dropna().iterrows():
    ax[0].annotate(row["ntacode"], xy=row["point"], 
        horizontalalignment="center", fontsize=10,
        verticalalignment="top")

# LEGEND ---------------------------------------------------------------
ax[0].legend([AnyObject()], ['park-cemetery-etc-Manhattan'],
           handler_map={AnyObject: AnyObjectHandler()}, loc="upper left")
ax[1].legend([AnyObject()], ['park-cemetery-etc-Manhattan'],
           handler_map={AnyObject: AnyObjectHandler()}, loc="upper left")

# fig.savefig("figs/trees_dead.png", bbox_inches="tight", facecolor="white")
plt.close()


# KDEPLOT ON MAP SHOWING TREE DENSITY ==================================
# CREATE CANVAS --------------------------------------------------------
fig, ax = dual_layout(
    suptitle="Manhattan's Neighborhoods and Tree Density",
    titles=("Manhattan's Neighborhoods", "Manhattan's Tree Density (KDE)")
)

# PLOT MAP -------------------------------------------------------------
man[man.nta_name=="park-cemetery-etc-Manhattan"].plot(ax=ax[0], **missing_kwds)
man[man.nta_name=="park-cemetery-etc-Manhattan"].plot(ax=ax[1], **missing_kwds)
man.dropna().plot(ax=ax[0], cmap="Pastel1", edgecolor="black")

# COMPUTE KDE ----------------------------------------------------------
try:
    Z # Don't recalculate
except NameError:
    # xmax, ymax = trees[["longitude", "latitude"]].max()
    # xmin, ymin = trees[["longitude", "latitude"]].min()
    xmin, xmax = ax[0].get_xlim()
    ymin, ymax = ax[0].get_ylim()
    X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    values = np.vstack([trees["longitude"].values, trees["latitude"].values])
    kernel = stats.gaussian_kde(values)
    Z = np.reshape(kernel(positions).T, X.shape)

# PLOT MAP -------------------------------------------------------------
ax[1].scatter(trees["longitude"].values, trees["latitude"].values,
            s=0.5, 
            #   color="darkgreen", 
            color="black",
            alpha=0.2)
ax[1].imshow(np.rot90(Z),
             extent=[xmin, xmax, ymin, ymax],
             cmap=plt.cm.Greens)
man.boundary.plot(ax=ax[1], edgecolor="grey", linewidth=0.5)

# ANNOTATIONS ----------------------------------------------------------
for i, row in man.dropna().iterrows():
    ax[0].annotate(row["ntacode"], xy=row["point"], 
        horizontalalignment="center", fontsize=10,
        verticalalignment="top")

# LEGEND ---------------------------------------------------------------
ax[0].legend([AnyObject()], ['park-cemetery-etc-Manhattan'],
           handler_map={AnyObject: AnyObjectHandler()}, loc="upper left")
ax[1].legend([AnyObject()], ['park-cemetery-etc-Manhattan'],
           handler_map={AnyObject: AnyObjectHandler()}, loc="upper left")

# fig.savefig("figs/trees_kde.png", bbox_inches="tight", facecolor="white")
plt.close()


# PLOT THE BOROS OF NEW YORK ===========================================
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.set_title("Boros", fontsize=16)
ax.set_xlabel("longitude", fontsize=14)
ax.set_ylabel("latitude", fontsize=14)

neighborhoods.plot("boroname", ax=ax, 
                   legend=True, legend_kwds={"loc": "upper left"},
                   cmap=src.style.cmap,
                   )
# fig.savefig("figs/boros.png", facecolor="white", bbox_inches="tight")
plt.close()


# DIAMETER VS HEALTH ===================================================
# FILTER TO LIVING TREES ONLY ------------------------------------------
df = trees[trees.status == "Alive"].copy()

# HEALTH CATEGORY ->> ORDINAL ------------------------------------------
health_scores = ["Poor", "Fair", "Good"]
df["health"] = df["health"].map(lambda h: health_scores.index(h))

# PER SPECIES AGGREGATES -----------------------------------------------
df = df.groupby("spc_common").agg({
        "spc_common": ["count"], 
        "tree_dbh": ["mean", "std"], 
        "health": ["mean", "std"],
        "nta": ["nunique"],
    })

# PLOT -----------------------------------------------------------------
fig, ax = plt.subplots(1, 1, figsize=(10, 7))
minsize = df[("spc_common", "count")].min() / 2 + 10
maxsize = df[("spc_common", "count")].max() / 2
sns.scatterplot(
    data=df,
    ax=ax,
    x=("health", "mean"),
    y=("tree_dbh", "mean"),
    alpha=0.5, 
    size=("spc_common", "count"), sizes=(minsize, maxsize),
);

ax.set_title("Mean Species Diameter vs Health Score\n(Radius represents number of living trees for that species)")
ax.set_xlabel("Mean Species Health Score")
ax.set_ylabel("Mean Species Diameter (inches)")
ax.set_xticks(range(3));
ax.set_xticklabels(health_scores);
ax.legend().set_visible(False)
# fig.savefig("figs/diameter_vs_health.png", facecolor="white", bbox_inches="tight")
plt.close()

	Status=Any	Status=Dead
curb_loc	{'OnCurb': 0.933, 'OffsetFromCurb': 0.067}	{'OnCurb': 0.956, 'OffsetFromCurb': 0.044}
status	{'Alive': 0.972, 'Dead': 0.028}	{'Dead': 1.0}
health	{'Good': 0.737, 'Fair': 0.178, 'Poor': 0.056, 'NAN': 0.028}	{'NAN': 1.0}
root_stone	{'No': 0.804, 'Yes': 0.196}	{'No': 1.0}
root_grate	{'No': 0.961, 'Yes': 0.039}	{'No': 1.0}
root_other	{'No': 0.922, 'Yes': 0.078}	{'No': 1.0}
trunk_wire	{'No': 0.986, 'Yes': 0.014}	{'No': 1.0}
trnk_light	{'No': 0.995, 'Yes': 0.005}	{'No': 1.0}
trnk_other	{'No': 0.913, 'Yes': 0.087}	{'No': 1.0}
brch_light	{'No': 0.986, 'Yes': 0.014}	{'No': 1.0}
brch_shoe	{'No': 0.999, 'Yes': 0.001}	{'No': 1.0}
brch_other	{'No': 0.898, 'Yes': 0.102}	{'No': 1.0}

	spc_common	tree_dbh		health		nta	overall_score
	count	mean	std	mean	std	nunique	mean	rank
spc_common
American elm	1698	13.899293	9.703312	1.755595	0.526030	28	0.918532	1.0
Siberian elm	156	12.064103	7.545714	1.794872	0.504406	17	0.865607	2.0
willow oak	889	10.811024	9.331016	1.809899	0.463624	27	0.825538	3.0
tree of heaven	104	11.451923	6.584595	1.740385	0.539601	18	0.825422	4.0
London planetree	4122	13.168607	7.340801	1.537603	0.658447	28	0.819583	5.0
pin oak	4584	10.068499	7.982340	1.784686	0.476936	28	0.790423	6.0
black locust	259	9.768340	4.735689	1.752896	0.482987	26	0.769029	7.0
honeylocust	13175	9.057837	3.997076	1.816167	0.425643	28	0.764560	8.0
Sophora	4453	9.225915	4.892435	1.756344	0.517578	28	0.750666	9.0
green ash	770	9.255844	4.371351	1.688312	0.576067	27	0.729065	10.0

1.a: Most Common Tree Species In Manhattan ¶

Which tree species should the city plant?¶

🎯 Aim¶

💡 Summary¶

Part 1: Exploratory Data Analysis¶

Part 2: Modelling & Recommendations¶

🧭 Part 1: Exploratory Data Analysis¶

Load Data¶

Tree Census¶

Neighborhoods' geographical information¶

Missing Data for Dead Trees

1.b: Which Are The Neighborhoods With The Most Trees?¶

1.c: A Visualization of Manhattan's Neighborhoods and Tree Locations ¶

🧠 Part 2: Recommendations ¶

Method¶

Conclusion¶

This Data Can't Tell The Whole Story ...¶

📎 Appendix¶

A. Predictors of Tree Health - Feature Importance Analysis¶

B. Additional Plots for Manhattan Tree Locations¶

C. Boros of New York¶

D. Survivor Bias - We Have More Data on Healthy Trees Because They Survive¶

📸 Image Attributions¶

❤️ Wow!¶

I can't believe you made it all the way to the end thank you so much!¶

	root_stone	root_grate	root_other	trunk_wire	trnk_light	trnk_other	brch_light	brch_shoe	brch_other	on_curb	tree_dbh	health
root_stone	1.00	-0.06	0.06	0.03	0.02	0.08	0.05	0.02	0.09	0.02	0.14	-0.05
root_grate	-0.06	1.00	0.03	0.00	0.04	0.05	0.02	0.00	0.04	0.00	-0.02	-0.04
root_other	0.06	0.03	1.00	0.03	0.04	0.20	0.03	0.01	0.18	0.04	0.05	-0.06
trunk_wire	0.03	0.00	0.03	1.00	0.09	0.02	0.16	-0.00	0.04	0.01	-0.01	-0.03
trnk_light	0.02	0.04	0.04	0.09	1.00	0.01	0.34	-0.00	0.01	0.00	0.00	-0.00
trnk_other	0.08	0.05	0.20	0.02	0.01	1.00	0.02	0.01	0.25	0.02	0.02	-0.15
brch_light	0.05	0.02	0.03	0.16	0.34	0.02	1.00	0.01	0.02	0.01	0.00	-0.03
brch_shoe	0.02	0.00	0.01	-0.00	-0.00	0.01	0.01	1.00	0.01	-0.01	0.01	-0.01
brch_other	0.09	0.04	0.18	0.04	0.01	0.25	0.02	0.01	1.00	0.04	0.00	-0.19
on_curb	0.02	0.00	0.04	0.01	0.00	0.02	0.01	-0.01	0.04	1.00	-0.14	-0.04
tree_dbh	0.14	-0.02	0.05	-0.01	0.00	0.02	0.00	0.01	0.00	-0.14	1.00	0.11
health	-0.05	-0.04	-0.06	-0.03	-0.00	-0.15	-0.03	-0.01	-0.19	-0.04	0.11	1.00

Which tree species should the city plant?¶

🎯 Aim¶

💡 Summary¶

Part 1: Exploratory Data Analysis¶

Part 2: Modelling & Recommendations¶

🧭 Part 1: Exploratory Data Analysis¶

Load Data¶

Tree Census¶

Neighborhoods' geographical information¶

Missing Data for Dead Trees

1.a: Most Common Tree Species In Manhattan¶

1.b: Which Are The Neighborhoods With The Most Trees?¶

1.c: A Visualization of Manhattan's Neighborhoods and Tree Locations¶

🧠 Part 2: Recommendations¶

Method¶

Conclusion¶

This Data Can't Tell The Whole Story ...¶

📎 Appendix¶

A. Predictors of Tree Health - Feature Importance Analysis¶

B. Additional Plots for Manhattan Tree Locations¶

C. Boros of New York¶

D. Survivor Bias - We Have More Data on Healthy Trees Because They Survive¶

📸 Image Attributions¶

❤️ Wow!¶

I can't believe you made it all the way to the end thank you so much!¶

1.a: Most Common Tree Species In Manhattan ¶

1.c: A Visualization of Manhattan's Neighborhoods and Tree Locations ¶

🧠 Part 2: Recommendations ¶