Data Visualization with Seaborn

 

Based on DataCamp.

Seaborn Cheat Sheet

Introduction to Seaborn

Scatter and Count Plot

# Making a scatter plot with lists

## Import Matplotlib and Seaborn
import matplotlib.pyplot as plt
import seaborn as sns

## Change this scatter plot to have percent literate on the y-axis
sns.scatterplot(x=gdp, y=percent_literate)

## Show plot
plt.show()

# Making a count plot with a list
## Create count plot with region on the y-axis
sns.countplot(y=region)

## Show plot
plt.show()

Using pandas with Seaborn

Making a count plot with a DataFrame

# Import Matplotlib, Pandas, and Seaborn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Create a DataFrame from csv file
df = pd.read_csv(csv_filepath)

# Create a count plot with "Spiders" on the x-axis
sns.countplot(x="Spiders", data=df)

# Display the plot
plt.show()

Hue

Hue and Scatter Plots

# Change the legend order in the scatter plot
sns.scatterplot(x="absences", y="G3", 
                data=student_data, 
                hue="location",
                hue_order=["Rural","Urban"])

# Show plot
plt.show()

Hue and Count Plots

# Create a dictionary mapping subgroup values to colors
palette_colors = {"Rural": "green", "Urban": "blue"}

# Create a count plot of school with location subgroups
sns.countplot(x="school", data=student_data, hue="location", palette=palette_colors)

# Display plot
plt.show()

Visualizing Two Quantitative Variables

Relational Plots and Subplots

Creating subplots with col and row

# Change to make subplots based on study time
sns.relplot(x="absences", y="G3", 
            data=student_data,
            kind="scatter",
            col="study_time") # or use row="study_time" to stack the subplots in rows

# Show plot
plt.show()

Creating two-factor subplots

# Adjust further to add subplots based on family support
sns.relplot(x="G1", y="G3", 
            data=student_data,
            kind="scatter", 
            col="schoolsup",
            col_order=['yes','no'],
            row="famsup",
            row_order=["yes", "no"])

# Show plot
plt.show()

Customizing scatter plots

Changing the size and style

# size
# Create scatter plot of horsepower vs. mpg
sns.relplot(x="horsepower", y="mpg", 
            data=mpg, kind="scatter", 
            size="cylinders",
            hue="cylinders")

# Show plot
plt.show()

# style
# Create a scatter plot of acceleration vs. mpg
sns.relplot(x="acceleration",y="mpg",
            data=mpg,
            kind='scatter',
            style="origin",
            hue="origin")

Line Plots

Interpreting line plots

# Create line plot
sns.relplot(x="model_year",y="mpg",
            data=mpg,
            kind='line')

# Show plot
plt.show()

Visualizing standard deviation

# Make the shaded area show the standard deviation
sns.relplot(x="model_year", y="mpg",
            data=mpg, kind="line", ci='sd')

# Show plot
plt.show()

Plotting subgroups

# Add markers and make each line have the same style
sns.relplot(x="model_year", y="horsepower", 
            data=mpg, kind="line", 
            ci=None, style="origin", 
            hue="origin", 
            markers=True, 
            dashes=False)

# Show plot
plt.show()

Visualizing a Categorical and a Quantitative Variable

Count Plots

# Create column subplots based on age category
sns.catplot(y="Internet usage", # Make the bars horizontal instead of vertical.
            data=survey_data,
            kind="count",
            col="Age Category") # column subplots 

# Show plot
plt.show()

Bar Plots

# Turn off the confidence intervals
sns.catplot(x="study_time", y="G3",
            data=student_data,
            kind="bar",
            # rearrange the categories so that they are in order from lowest study time to highest.
            order=["<2 hours", 
                   "2 to 5 hours", 
                   "5 to 10 hours", 
                   ">10 hours"], 
            ci=None) # no longer displays confidence intervals.

# Show plot
plt.show()

Box Plots

Create and interpret a box plot

# Specify the category ordering
study_time_order = ["<2 hours", "2 to 5 hours", 
                    "5 to 10 hours", ">10 hours"]

# Create a box plot and set the order of the categories
sns.catplot(x="study_time", y="G3", 
            data=student_data,
            kind='box',
            order=study_time_order)

Omitting outliers

# Create a box plot with subgroups and omit the outliers
sns.catplot(x="internet",y="G3",
            data=student_data,
            kind='box',
            hue="location",
            sym='') # Omitting outliers

Adjusting the whiskers

# Set the whiskers to 0.5 * IQR
sns.catplot(x="romantic", y="G3",
            data=student_data,
            kind="box",
            whis=0.5) # alternatives: whis=[5, 95] (5th/95th percentiles) or whis=[0, 100] (min/max)

# Show plot
plt.show()

Point plots

Customizing point plots

# Remove the lines joining the points
sns.catplot(x="famrel", y="absences",
            data=student_data,
            kind="point",
            # Add "caps" to the end of the confidence intervals with size 0.2
            capsize=0.2,
            # Remove the lines joining the points in each category.
            join=False)
            
# Show plot
plt.show()

Point plots with subgroups

# Import median function from numpy
from numpy import median

# Plot the median number of absences instead of the mean
sns.catplot(x="romantic", y="absences",
            data=student_data,
            kind="point",
            hue="school",
            ci=None,
            # display the median number of absences instead of the average
            estimator=median)

# Show plot
plt.show()

Customizing Seaborn Plots

Changing style and palette

# Set the style to "whitegrid"
sns.set_style("whitegrid")

# Change the color palette to "RdBu"
sns.set_palette("RdBu")

Changing the scale

# Set the context to "paper"
# Options, from smallest to largest scale: "paper", "notebook", "talk", "poster".
sns.set_context("paper")

Using a custom palette

# Set the style to "darkgrid"
sns.set_style("darkgrid")

# Set a custom color palette
custom_color = ["#39A7D0","#36ADA4"]
sns.set_palette(custom_color)

# Create the box plot of age distribution by gender
sns.catplot(x="Gender", y="Age", 
            data=survey_data, kind="box")

# Show plot
plt.show()

Adding titles and labels

FacetGrid: returned by relplot(), catplot() - can create subplots
AxesSubplot: returned by scatterplot(), countplot(), etc. - only creates a single plot

Adding a title to FacetGrid

g = sns.catplot(x="Region", 
                y="Birthrate",
                data=gdp_data,
                kind="box") 
# y: adjust height of title in FacetGrid
g.fig.suptitle("New Title", y=1.03) 
plt.show()

Adding a title to AxesSubplot

g = sns.boxplot(x="Region", 
                y="Birthrate",
                data=gdp_data,) 
g.set_title("New Title", y=1.03)
plt.show()

Titles for subplots

# Use the figure-level catplot() here, not the axes-level boxplot():
# only catplot() accepts `kind`/`col` and returns a FacetGrid, which is
# what provides `fig.suptitle` and `set_titles` below.
g = sns.catplot(x="Region",
                y="Birthrate",
                data=gdp_data,
                kind="box",
                col="Group")
# set main title
g.fig.suptitle("New Title", y=1.03)

# set subtitles
g.set_titles("This is {col_name}")

plt.show()

Adding axis labels

g.set(xlabel="New X Label",
      ylabel="New Y Label")

Rotating x-axis tick labels

plt.xticks(rotation=90)

Application

Box plot with subgroups

# Set palette to "Blues"
sns.set_palette("Blues")

# Adjust to add subgroups based on "Interested in Pets"
g = sns.catplot(x="Gender",
                y="Age", data=survey_data, 
                kind="box", hue="Interested in Pets")

# Set title to "Age of Those Interested in Pets vs. Not"
g.fig.suptitle("Age of Those Interested in Pets vs. Not")

# Show plot
plt.show()

Bar plot with subgroups and subplots

# Set the figure style to "dark"
sns.set_style("dark")

# Adjust to add subplots per gender
g = sns.catplot(x="Village - town", y="Likes Techno", 
                data=survey_data, kind="bar",
                col="Gender")

# Add title and axis labels
g.fig.suptitle("Percentage of Young People Who Like Techno", y=1.02)
g.set(xlabel="Location of Residence", 
      ylabel="% Who Like Techno")

# Show plot
plt.show()

Intermediate Seaborn

Distribution Plot

Plot a histogram

# Create a distplot
sns.distplot(df['Award_Amount'],
            # disable the KDE to get histogram
             kde=False,
             bins=20)

# Display the plot
plt.show()

Rug plot and kde shading

# Create a distplot of the Award Amount
sns.distplot(df['Award_Amount'],
             hist=False,
             # Add a rug plot above the x axis
             rug=True, 
             # Configure it to show a shaded kde (using the kde_kws dictionary).
             kde_kws={'shade':True})

# Plot the results
plt.show()

Regression Plots in Seaborn

regplot()

# Create a regression plot of premiums vs. insurance_losses
sns.regplot(x="insurance_losses", y="premiums", data=df)

# Display the plot
plt.show()

lmplot()

# Create a regression plot using hue
sns.lmplot(data=df,
           x="insurance_losses",
           y="premiums",
           # Plot a regression line for each Region of the country.
           hue="Region",
           # Create a plot for each Region of the country.
           row="Region")

# Show the results
plt.show()

Customizing Distribution Plot

python matplotlib中axes与axis的区别 (The difference between axes and axis in Python matplotlib)

Using Seaborn Styles

Setting the default style

# Set the default seaborn style
sns.set()

# Plot the pandas histogram 
df['fmr_2'].plot.hist()
plt.show()
plt.clf()

Removing spines

# Set the style to white
sns.set_style('white')

# Create a regression plot
sns.lmplot(data=df,
           x='pop2010',
           y='fmr_2')

# Remove the spines
sns.despine(right=True)

# Show the plot and clear the figure
plt.show()
plt.clf()

Colors in Seaborn

Matplotlib color codes

# Set style, enable color code, and create a magenta distplot
sns.set(color_codes=True)
sns.distplot(df['fmr_3'], color='m')

# Show the plot
plt.show()

Using default palettes

  • Circular colors = when the data is not ordered
  • Sequential colors = when the data has a consistent range from high to low
  • Diverging colors = when both the low and high values are interesting
# Loop through differences between bright and colorblind palettes
for p in ['bright', 'colorblind']:
    sns.set_palette(p)
    sns.distplot(df['fmr_3'])
    plt.show()
    
    # Clear the plots    
    plt.clf()

Creating Custom Palettes

# Create and display a Purples sequential palette containing 8 colors.
sns.palplot(sns.color_palette( "Purples", 8))
plt.show()

# Create and display a palette with 10 colors using the husl system.
sns.palplot(sns.color_palette( "husl", 10))
plt.show()

# Create and display a diverging palette with 6 colors coolwarm.
sns.palplot(sns.color_palette( "coolwarm", 6))
plt.show()

Customizing with matplotlib

Using matplotlib axes

# Create a figure and axes
fig, ax = plt.subplots()

# Plot the distribution of data
sns.distplot(df['fmr_3'], ax=ax)

# Create a more descriptive x axis label
ax.set(xlabel="3 Bedroom Fair Market Rent")

# Show the plot
plt.show()

Additional plot customizations

# Create a figure and axes
fig, ax = plt.subplots()

# Plot the distribution of 1 bedroom rents
sns.distplot(df['fmr_1'], ax=ax)

# Modify the properties of the plot
ax.set(xlabel="1 Bedroom Fair Market Rent",
       # Change the x axis limits to be between 100 and 1500.
       xlim=(100,1500),
       # Add a descriptive title of "US Rent" to the plot.
       title="US Rent")

# Display the plot
plt.show()

Adding annotations

# Create a figure and axes. Then plot the data
fig, ax = plt.subplots()
sns.distplot(df['fmr_1'], ax=ax)

# Customize the labels and limits
ax.set(xlabel="1 Bedroom Fair Market Rent", xlim=(100,1500), title="US Rent")

# Compute the statistics to mark. Distinct names are used so that the
# `median` function imported from numpy earlier in these notes is not
# passed to axvline by mistake.
median_rent = df['fmr_1'].median()
mean_rent = df['fmr_1'].mean()

# Add vertical lines for the median and mean
ax.axvline(x=median_rent, color='m', label='Median', linestyle='--', linewidth=2)
ax.axvline(x=mean_rent, color='b', label='Mean', linestyle='-', linewidth=2)

# Show the legend and plot the data
ax.legend()
plt.show()

Multiple plots

# Create a plot with 1 row and 2 columns that share the y axis label
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, sharey=True)

# Plot the distribution of 1 bedroom apartments on ax0
sns.distplot(df['fmr_1'], ax=ax0)
ax0.set(xlabel="1 Bedroom Fair Market Rent", xlim=(100,1500))

# Plot the distribution of 2 bedroom apartments on ax1
sns.distplot(df['fmr_2'], ax=ax1)
ax1.set(xlabel="2 Bedroom Fair Market Rent", xlim=(100,1500))

# Display the figure. Without this, the next axes-level plot that is
# called without `ax=` would be drawn onto ax1 of this still-open figure.
plt.show()

Additional Plot Types

Categorical Plot Types

stripplot() and swarmplot()

# Create the stripplot
sns.stripplot(data=df,
         x='Award_Amount',
         y='Model Selected',
         jitter=True)

plt.show()
# Create and display a swarmplot with hue set to the Region
sns.swarmplot(data=df,
         x='Award_Amount',
         y='Model Selected',
         hue='Region')

plt.show()

boxplots, violinplots and lvplots

# Create a boxplot
sns.boxplot(data=df,
         x='Award_Amount',
         y='Model Selected')

plt.show()
plt.clf()

# Create a violinplot with the husl palette
sns.violinplot(data=df,
         x='Award_Amount',
         y='Model Selected',
         palette='husl')

plt.show()
plt.clf()

# Create a letter-value (boxen) plot with the Paired palette and the Region column as the hue
# NOTE: lvplot() was renamed boxenplot() in seaborn 0.9.
sns.boxenplot(data=df,
         x='Award_Amount',
         y='Model Selected',
         palette='Paired',
         hue='Region')

plt.show()
plt.clf()

barplots, pointplots and countplots

# Show a countplot with the number of models used with each region a different color
sns.countplot(data=df,
         y="Model Selected",
         hue="Region")

plt.show()
plt.clf()

# Create a pointplot and include the capsize in order to show bars on the confidence interval
sns.pointplot(data=df,
         y='Award_Amount',
         x='Model Selected',
         # Use a capsize in the pointplot in order to show the confidence interval.
         capsize=.1)

plt.show()
plt.clf()

Regression Plots

Regression and residual plots

# Display a regression plot for Tuition
sns.regplot(data=df,
         y='Tuition',
         x="SAT_AVG_ALL",
         marker='^',
         color='g')

plt.show()
plt.clf()

# Display the residual plot
sns.residplot(data=df,
          y='Tuition',
          x="SAT_AVG_ALL",
          color='g')

plt.show()
plt.clf()

Regression plot parameters

# Plot a regression plot of Tuition and the Percentage of Pell Grants
sns.regplot(data=df,
            y='Tuition',
            x="PCTPELL",
            #  breaks the PCTPELL column into 5 different bins.
            x_bins=5,
            # using a 2nd order polynomial regression line
            order=2)

plt.show()
plt.clf()

Binning data

# Create a scatter plot by disabling the regression line
sns.regplot(data=df,
            y='Tuition',
            x="UG",
            # disable the regression line
            fit_reg=False) 

plt.show()
plt.clf()

# Create a regplot and bin the data into 8 bins
sns.regplot(data=df,
         y='Tuition',
         x="UG",
         x_bins=8)

plt.show()
plt.clf()

Matrix plots

# Create a crosstab table of the data
pd_crosstab = pd.crosstab(df["Group"], df["YEAR"])
print(pd_crosstab)

# Plot a heatmap of the table with no color bar and using the BuGn palette
sns.heatmap(pd_crosstab, cbar=False, cmap="BuGn", linewidths=0.3)

# Rotate tick marks for visibility
plt.yticks(rotation=0)
plt.xticks(rotation=90)

plt.show()

Creating Plots on Data Aware Grids

Using FacetGrid, factorplot and lmplot

Building a FacetGrid

# Create FacetGrid with Degree_Type and specify the order of the rows using row_order
g2 = sns.FacetGrid(df, 
             row="Degree_Type",
             row_order=['Graduate', 'Bachelors', 'Associates', 'Certificate'])

# Map a pointplot of SAT_AVG_ALL onto the grid
g2.map(sns.pointplot, 'SAT_AVG_ALL')

# Show the plot
plt.show()
plt.clf()

Using a factorplot

In many cases, Seaborn’s factorplot() (renamed catplot() in seaborn 0.9) can be a simpler way to create a FacetGrid. Instead of creating a grid and mapping the plot, we can use factorplot() to create a plot with one line of code.

# Create a facetted pointplot of Average SAT_AVG_ALL scores facetted by Degree Type
# NOTE: factorplot() was renamed catplot() in seaborn 0.9; catplot() is used
# here to match the other categorical plots in these notes. `kind='point'`
# is passed explicitly, so the plot type does not depend on the default.
sns.catplot(data=df,
            x='SAT_AVG_ALL',
            # shows a pointplot
            kind='point',
            row='Degree_Type',
            # Use row_order to order the degrees from highest to lowest level.
            row_order=['Graduate', 'Bachelors', 'Associates', 'Certificate'])

plt.show()
plt.clf()

Using a lmplot

The lmplot is used to plot scatter plots with regression lines on FacetGrid objects.

# Create a FacetGrid varying by column and columns ordered with the degree_order variable
g = sns.FacetGrid(df, col="Degree_Type", col_order=degree_ord)

# Map a scatter plot of Undergrad Population compared to PCTPELL
g.map(plt.scatter, 'UG', 'PCTPELL')

plt.show()
plt.clf()

# Re-create the plot above as an lmplot
sns.lmplot(data=df,
        x='UG',
        y='PCTPELL',
        col="Degree_Type",
        col_order=degree_ord)

plt.show()
plt.clf()

# Create an lmplot that has a column for Ownership, a row for Degree_Type and hue based on the WOMENONLY column
sns.lmplot(data=df,
        x='SAT_AVG_ALL',
        y='Tuition',
        col="Ownership",
        row='Degree_Type',
        row_order=['Graduate', 'Bachelors'],
        hue='WOMENONLY',
        col_order=inst_ord)

plt.show()
plt.clf()

Using PairGrid and pairplot

Building a PairGrid

# Create a PairGrid with a scatter plot for fatal_collisions and premiums
g = sns.PairGrid(df, vars=["fatal_collisions", "premiums"])
g2 = g.map(plt.scatter)

plt.show()
plt.clf()

# Create the same PairGrid but map a histogram on the diag
g = sns.PairGrid(df, vars=["fatal_collisions", "premiums"])
g2 = g.map_diag(plt.hist) #  plot a histogram on the diagonal
g3 = g2.map_offdiag(plt.scatter) # scatter plot on the off diagonal

plt.show()
plt.clf()

Using a pairplot

The pairplot() function is generally a more convenient way to look at pairwise relationships.

# Plot the same data but use a different color palette and color code by Region
sns.pairplot(data=df,
        vars=["fatal_collisions", "premiums"],
        kind='scatter',
        # using the "Region" to color code the results
        hue='Region',
        # Use the RdBu palette to change the colors of the plot
        palette='RdBu',
        diag_kws={'alpha':.5})

plt.show()
plt.clf()

Additional pairplots

# Build a pairplot with different x and y variables
sns.pairplot(data=df,
        # define the x_vars and y_vars that you wish to examine
        x_vars=["fatal_collisions_speeding", "fatal_collisions_alc"],
        y_vars=['premiums', 'insurance_losses'],
        kind='scatter',
        # Use the husl palette and color code the scatter plot by Region
        hue='Region',
        palette='husl')

plt.show()
plt.clf()

# Plot relationships between insurance_losses and premiums
sns.pairplot(data=df,
             vars=["insurance_losses", "premiums"],
             # Use a reg plot for the non-diagonal plots
             kind='reg',
             # Use the BrBG palette for the final plot.
             palette='BrBG',
             # diag_kind controls the type of plot shown on the diagonal.
             diag_kind = 'kde',
             hue='Region')

plt.show()
plt.clf()

Using JointGrid and jointplot

Building a JointGrid and jointplot

Seaborn’s JointGrid combines univariate plots such as histograms, rug plots and kde plots with bivariate plots such as scatter and regression plots.

# Build a JointGrid comparing humidity and total_rentals
sns.set_style("whitegrid") # Use Seaborn's "whitegrid" style for these plots.
g = sns.JointGrid(x="hum",
            y="total_rentals",
            data=df,
            xlim=(0.1, 1.0)) 

# Plot a regplot() and distplot() on the margins.
g.plot(sns.regplot, sns.distplot)

plt.show()
plt.clf()

# Create a jointplot similar to the JointGrid 
sns.jointplot(x="hum",
        y="total_rentals",
        kind='reg',
        data=df)

plt.show()
plt.clf()

Jointplots and regression

# Plot temp vs. total_rentals as a regression plot
sns.jointplot(x="temp",
         y="total_rentals",
         kind='reg',
         data=df,
         # 2nd order polynomial regression
         order=2,
         xlim=(0, 1))

plt.show()
plt.clf()

# Plot a jointplot showing the residuals to check the appropriateness of the model.
sns.jointplot(x="temp",
        y="total_rentals",
        kind='resid',
        data=df,
        order=2)

plt.show()
plt.clf()

Complex jointplots

The jointplot is a convenience wrapper around many of the JointGrid functions. However, it is possible to overlay some of the JointGrid plots on top of the standard jointplot.

# Create a jointplot of temp vs. casual riders
# Include a kdeplot over the scatter plot
g = (sns.jointplot(x="temp",
             y="casual",
             kind='scatter',
             data=df,
             marginal_kws=dict(bins=10, rug=True))
    .plot_joint(sns.kdeplot)) # Overlay a kdeplot on top of the scatter plot.
    
plt.show()
plt.clf()