Skip to article frontmatter | Skip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

import geopandas as gpd

# Path to the shapefile (.shp) to analyse, relative to the working directory.
file_path = 'calenviroscreen40shpf2021shp/CES4 Final Shapefile.shp'

# Read the shapefile into a GeoDataFrame (attribute columns plus geometry).
gdf = gpd.read_file(file_path)

# Preview the first rows. A bare `gdf.head()` expression is only rendered when
# it is the last line of a notebook cell; anywhere else the result is thrown
# away, so print it explicitly.
print(gdf.head())

# Draw the geometries with default styling as a quick visual sanity check.
gdf.plot()
# Tabular overview of the dataset. Every result is printed explicitly: a bare
# expression statement is evaluated and then discarded unless it happens to be
# the last line of a notebook cell.

# First few rows
print(gdf.head())

# Column names available in the dataset
print(gdf.columns)

# Count of null values per column
print(gdf.isnull().sum())

# NOTE(review): CalEnviroScreen's published files reportedly code "no data"
# as -999 rather than null, which isnull() cannot see -- confirm against the
# data dictionary shipped with the shapefile. Count those per numeric column
# too; the result is all zeros if the file contains no such sentinel.
print((gdf.select_dtypes(include='number') == -999).sum())

# Summary statistics for the numerical columns
print(gdf.describe())
import matplotlib.pyplot as plt

# Two side-by-side choropleths: pollution burden score (left) and poverty as
# an example population-vulnerability indicator (right).
fig, ax = plt.subplots(1, 2, figsize=(15, 7))

# NOTE(review): CalEnviroScreen's published files reportedly use -999 as a
# "no data" code -- confirm against the data dictionary. Left in place, a
# single -999 stretches the colour scale so far that real values become
# indistinguishable, so mask it to NaN on a temporary copy (gdf itself is not
# modified) and draw those shapes in grey. This is a no-op when no -999 exists.
pollution_score = gdf['PolBurdSc']
gdf.assign(PolBurdSc=pollution_score.mask(pollution_score == -999)).plot(
    column='PolBurdSc', ax=ax[0], legend=True, cmap='OrRd',
    missing_kwds={'color': 'lightgrey'})
ax[0].set_title('Pollution Burden Score')

poverty = gdf['Poverty']
gdf.assign(Poverty=poverty.mask(poverty == -999)).plot(
    column='Poverty', ax=ax[1], legend=True, cmap='PuBu',
    missing_kwds={'color': 'lightgrey'})
ax[1].set_title('Population Vulnerability (Poverty)')

plt.show()
# Ten rows with the highest cumulative impact score ('CIscore').
top_10 = gdf.nlargest(n=10, columns='CIscore')

# Red bar chart of those scores, one bar per row, labelled by the 'Tract'
# column. The returned axes is the current one, so it is styled directly.
bar_ax = top_10.plot.bar(x='Tract', y='CIscore', legend=False, color='red')
bar_ax.set_title('Top 10 Most Impacted Communities')
bar_ax.set_ylabel('Cumulative Impact Score')

# Tilt the tick labels so the long identifiers do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()
# Scatter plot of pollution burden score (x) against poverty (y).
#
# NOTE(review): CalEnviroScreen's published files reportedly use -999 as a
# "no data" code -- confirm against the data dictionary. Such rows would be
# drawn as points at -999 and flatten the real point cloud against one edge,
# so exclude them first. The filter keeps every row if no -999 is present.
has_both_scores = (gdf['PolBurdSc'] != -999) & (gdf['Poverty'] != -999)
gdf[has_both_scores].plot.scatter(
    x='PolBurdSc', y='Poverty', alpha=0.5, color='blue')

plt.title('Pollution Burden vs Poverty')
plt.xlabel('Pollution Burden Score')
plt.ylabel('Poverty Score')
plt.show()
# Keep only the rows whose 'County' value is exactly 'Los Angeles'.
in_los_angeles = gdf['County'] == 'Los Angeles'
la_county = gdf[in_los_angeles]

# Choropleth of the 'PM2_5' column for that subset, with a colour legend.
la_ax = la_county.plot(column='PM2_5', legend=True, cmap='coolwarm')
la_ax.set_title('PM2.5 Air Quality in Los Angeles County')
plt.show()
# Pull out the three air-quality columns to compare.
hazard_columns = ['Ozone', 'PM2_5', 'DieselPM']
hazards = gdf[hazard_columns]

# One line per column, plotted against the row index.
hazard_ax = hazards.plot(figsize=(10, 6))
hazard_ax.set_title(
    'Comparison of Ozone, PM2.5, and Diesel Particulate Matter Across Regions')
hazard_ax.set_xlabel('Region (Tracts or ZIP)')
hazard_ax.set_ylabel('Concentration')

# Replace the raw column names in the legend with readable labels
# (same order as hazard_columns).
hazard_ax.legend(['Ozone', 'PM2.5', 'Diesel PM'])
plt.show()
import seaborn as sns

# Indicators to include in the correlation matrix.
corr_columns = ['Poverty', 'Unempl', 'HousBurd', 'PolBurdSc', 'Asthma', 'Cardiovas']

# NOTE(review): CalEnviroScreen's published files reportedly use -999 as a
# "no data" code -- confirm against the data dictionary. Treated as real
# numbers, those values dominate every pairwise correlation, so turn them into
# NaN first; corr() then uses only the rows where both values are present.
# This is a no-op when the columns contain no -999.
indicators = gdf[corr_columns]
corr_matrix = indicators.mask(indicators == -999).corr()

# Annotated heatmap on a fixed [-1, 1] colour scale so that the diverging
# colormap is centred on zero correlation.
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Between Population Characteristics and Pollution Burden')
plt.show()
# Stacked bar plot of racial composition, one bar per ZIP code.
#
# Plotting the rows directly draws one bar per row, so a ZIP code shared by
# several rows shows up as several separate bars with the same label. Collapse
# to one row per ZIP first. If every ZIP is already unique this only sorts the
# bars by ZIP.
# NOTE(review): this is a simple unweighted mean of the rows sharing a ZIP;
# weight by each row's population if exact ZIP-level shares are required.
race_columns = ['Hispanic', 'White', 'AfricanAm', 'AAPI', 'NativeAm']
composition_by_zip = gdf[['ZIP'] + race_columns].groupby('ZIP').mean()

composition_by_zip.plot(
    kind='bar', stacked=True, figsize=(10, 6), colormap='viridis')

plt.title('Racial Composition by ZIP Code')
plt.ylabel('Proportion of Population')
plt.legend(title='Racial Groups')
plt.xticks(rotation=90)
plt.show()
# Statewide choropleth of the cumulative impact score ('CIscore').
#
# NOTE(review): CalEnviroScreen's published files reportedly use -999 as a
# "no data" code -- confirm against the data dictionary. Left in place it
# stretches the colour scale down to -999 and washes out every real score, so
# mask it to NaN on a temporary copy (gdf itself is not modified) and draw
# the unscored shapes in grey. This is a no-op when no -999 is present.
ci_score = gdf['CIscore']
gdf.assign(CIscore=ci_score.mask(ci_score == -999)).plot(
    column='CIscore', legend=True, missing_kwds={'color': 'lightgrey'})

plt.title('Cumulative Impact Scores Across California')
plt.show()