Cyclist%20logo.jpg

DA | Cyclistic Bike Sharing | Google Capstone

This repository contains the source code for the Cyclistic bike-share data analysis project. Get a behind-the-scenes look at how we analyzed rider usage patterns!

Ready to Shift Gears?

  • Source Code: Dive into the heart of the project by examining the code within this repository. It's your roadmap to understanding how we analyzed cyclist data.

  • Detailed Documentation: To fully grasp the project's goals, methodology, and results, explore our comprehensive documentation.

  • Blog Post: Seeking a concise overview of the project's key findings? Visit our blog post for a streamlined explanation.

  • Visit Our Site: Curious about other data-driven projects? Head over to our website for more!

  • Got Questions? Feel free to reach out!

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
import glob
import pandas as pd

# Use glob to find all CSV files in the directory.
# NOTE(review): the path is machine-specific; adjust it when running elsewhere.
# sorted() makes the load order deterministic, because glob.glob() returns
# its matches in arbitrary, OS-dependent order.
csv_files = sorted(glob.glob(r'C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\*.csv'))

# Fail early with an actionable message instead of pandas' generic
# "No objects to concatenate" error when no CSV file matches.
if not csv_files:
    raise FileNotFoundError('No CSV files found in the Cycle data directory.')

# Print the list of CSV files found
print(*csv_files, sep="\n")

# Concatenate all CSV files into a single DataFrame with a fresh 0..n-1 index
cycle_data = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202201-divvy-tripdata.csv.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202202-divvy-tripdata.csv.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202203-divvy-tripdata.csv.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202204-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202205-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202206-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202207-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202208-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202209-divvy-publictripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202210-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202211-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202212-divvy-tripdata.csv
In [3]:
# Preview the first five rows of the combined trip table
cycle_data.head(n=5)
Out[3]:
ride_id rideable_type started_at ended_at start_station_name start_station_id end_station_name end_station_id start_lat start_lng end_lat end_lng member_casual
0 C2F7DD78E82EC875 electric_bike 2022-01-13 11:59:47 2022-01-13 12:02:44 Glenwood Ave & Touhy Ave 525 Clark St & Touhy Ave RP-007 42.012800 -87.665906 42.012560 -87.674367 casual
1 A6CF8980A652D272 electric_bike 2022-01-10 08:41:56 2022-01-10 08:46:17 Glenwood Ave & Touhy Ave 525 Clark St & Touhy Ave RP-007 42.012763 -87.665967 42.012560 -87.674367 casual
2 BD0F91DFF741C66D classic_bike 2022-01-25 04:53:40 2022-01-25 04:58:01 Sheffield Ave & Fullerton Ave TA1306000016 Greenview Ave & Fullerton Ave TA1307000001 41.925602 -87.653708 41.925330 -87.665800 member
3 CBB80ED419105406 classic_bike 2022-01-04 00:18:04 2022-01-04 00:33:00 Clark St & Bryn Mawr Ave KA1504000151 Paulina St & Montrose Ave TA1309000021 41.983593 -87.669154 41.961507 -87.671387 casual
4 DDC963BFDDA51EEA classic_bike 2022-01-20 01:31:10 2022-01-20 01:37:12 Michigan Ave & Jackson Blvd TA1309000002 State St & Randolph St TA1305000029 41.877850 -87.624080 41.884621 -87.627834 member
In [4]:
# Use head() to display the first few rows of the DataFrame
print(cycle_data.head())

# info() writes its report straight to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
cycle_data.info()

# Summary statistics for the numeric columns
print(cycle_data.describe())

# Check for missing values in each column
print(cycle_data.isnull().sum())

# Check data types of columns
print(cycle_data.dtypes)
            ride_id  rideable_type           started_at             ended_at  \
0  C2F7DD78E82EC875  electric_bike  2022-01-13 11:59:47  2022-01-13 12:02:44   
1  A6CF8980A652D272  electric_bike  2022-01-10 08:41:56  2022-01-10 08:46:17   
2  BD0F91DFF741C66D   classic_bike  2022-01-25 04:53:40  2022-01-25 04:58:01   
3  CBB80ED419105406   classic_bike  2022-01-04 00:18:04  2022-01-04 00:33:00   
4  DDC963BFDDA51EEA   classic_bike  2022-01-20 01:31:10  2022-01-20 01:37:12   

              start_station_name start_station_id  \
0       Glenwood Ave & Touhy Ave              525   
1       Glenwood Ave & Touhy Ave              525   
2  Sheffield Ave & Fullerton Ave     TA1306000016   
3       Clark St & Bryn Mawr Ave     KA1504000151   
4    Michigan Ave & Jackson Blvd     TA1309000002   

                end_station_name end_station_id  start_lat  start_lng  \
0           Clark St & Touhy Ave         RP-007  42.012800 -87.665906   
1           Clark St & Touhy Ave         RP-007  42.012763 -87.665967   
2  Greenview Ave & Fullerton Ave   TA1307000001  41.925602 -87.653708   
3      Paulina St & Montrose Ave   TA1309000021  41.983593 -87.669154   
4         State St & Randolph St   TA1305000029  41.877850 -87.624080   

     end_lat    end_lng member_casual  
0  42.012560 -87.674367        casual  
1  42.012560 -87.674367        casual  
2  41.925330 -87.665800        member  
3  41.961507 -87.671387        casual  
4  41.884621 -87.627834        member  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5667717 entries, 0 to 5667716
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
dtypes: float64(4), object(9)
memory usage: 562.1+ MB
None
          start_lat     start_lng       end_lat       end_lng
count  5.667717e+06  5.667717e+06  5.661859e+06  5.661859e+06
mean   4.190222e+01 -8.764783e+01  4.190242e+01 -8.764790e+01
std    4.626109e-02  2.999925e-02  6.805821e-02  1.082985e-01
min    4.164000e+01 -8.784000e+01  0.000000e+00 -8.814000e+01
25%    4.188103e+01 -8.766154e+01  4.188103e+01 -8.766260e+01
50%    4.190000e+01 -8.764410e+01  4.190000e+01 -8.764414e+01
75%    4.193000e+01 -8.762957e+01  4.193000e+01 -8.762963e+01
max    4.563503e+01 -7.379648e+01  4.237000e+01  0.000000e+00
ride_id                    0
rideable_type              0
started_at                 0
ended_at                   0
start_station_name    833064
start_station_id      833064
end_station_name      892742
end_station_id        892742
start_lat                  0
start_lng                  0
end_lat                 5858
end_lng                 5858
member_casual              0
dtype: int64
ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object
In [5]:
# Discard every trip that has at least one missing field
cycle_data.dropna(inplace=True)

# Normalise the headers: spaces become underscores, everything lower-case
normalised_columns = cycle_data.columns.str.replace(' ', '_').str.lower()
cycle_data.columns = normalised_columns

print(cycle_data.columns)

# Parse both trip timestamp columns ('started_at' and 'ended_at') into datetimes
for timestamp_column in ('started_at', 'ended_at'):
    cycle_data[timestamp_column] = pd.to_datetime(cycle_data[timestamp_column])
Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='object')
In [6]:
# Ride length in minutes. total_seconds() covers the whole timedelta, whereas
# .dt.seconds only returns its seconds component (0-86399): that silently
# drops whole days for multi-day rides and wraps negative durations.
cycle_data['ride_duration_minutes'] = (
    (cycle_data['ended_at'] - cycle_data['started_at']).dt.total_seconds() / 60
)
In [7]:
# Exploratory data analysis (EDA): mean trip length as a Timedelta
trip_lengths = cycle_data['ended_at'] - cycle_data['started_at']
average_ride_duration = trip_lengths.mean()
print("Average ride duration:", average_ride_duration)
Average ride duration: 0 days 00:17:05.710490552
In [9]:
# Tally the rides per rider category in the 'member_casual' column
ride_count_by_user_type = cycle_data['member_casual'].value_counts()

# Heading and table go out as two lines of a single print call
print("Ride count by user type:", ride_count_by_user_type, sep="\n")
Ride count by user type:
member    2611171
casual    1758189
Name: member_casual, dtype: int64
In [10]:
# Hour of the day in which each ride started, taken from 'started_at'
cycle_data['start_hour'] = cycle_data['started_at'].dt.hour

# Rides per starting hour, as ordered by value_counts()
ride_count_by_hour = cycle_data['start_hour'].value_counts()

print("Peak hours for rides:", ride_count_by_hour, sep="\n")
Peak hours for rides:
17    451135
16    384582
18    373986
15    307606
19    273653
14    263969
13    257552
12    255740
11    219678
8     219256
20    193556
7     181117
10    176532
9     167054
21    156609
22    127787
6      97554
23     87524
0      58535
1      36905
5      34501
2      21266
3      12443
4      10820
Name: start_hour, dtype: int64
In [11]:
import seaborn as sns
import matplotlib.pyplot as plt

# Ride length in minutes for every trip. total_seconds() is used because
# .dt.seconds only returns the seconds component of each timedelta, which
# under-reports any ride longer than 24 hours.
ride_minutes = (cycle_data['ended_at'] - cycle_data['started_at']).dt.total_seconds() / 60

# Create a box plot for ride duration by user type
plt.figure(figsize=(10, 6))
sns.boxplot(x='member_casual', y=ride_minutes, data=cycle_data, palette='pastel')
plt.xlabel('User Type')
plt.ylabel('Ride Duration (minutes)')
plt.title('Distribution of Ride Durations by User Type')
# Show only the 0-150 minute range on the y-axis
plt.ylim(0, 150)
plt.show()
In [12]:
import folium
from folium.plugins import HeatMap
from IPython.display import display

# Latitude and longitude of the map centre (Chicago)
latitude_center = 41.8781
longitude_center = -87.6298

# Create a map centred on Chicago, where the rides take place. The named
# constants are used so the coordinates are defined in exactly one place.
m = folium.Map(location=[latitude_center, longitude_center], zoom_start=12)

# [lat, lng] pairs for every ride start and every ride end
start_locations = cycle_data[['start_lat', 'start_lng']].values.tolist()
end_locations = cycle_data[['end_lat', 'end_lng']].values.tolist()

# Overlay one heat layer for the start points and one for the end points
HeatMap(start_locations).add_to(m)
HeatMap(end_locations).add_to(m)

# Save the map to an HTML file
m.save('start_end_heatmap.html')
In [13]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Make sure the start timestamps are datetimes before grouping by day
cycle_data['started_at'] = pd.to_datetime(cycle_data['started_at'])

# Number of rides that started on each calendar day
ride_count_by_day = cycle_data.groupby(pd.Grouper(key='started_at', freq='D')).size()

# Line chart of the daily totals, built through the Axes object
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    ride_count_by_day.index,
    ride_count_by_day.values,
    color='#4CAF50',
    linewidth=2,
    marker='o',
    markersize=6,
    markerfacecolor='#FF5722',
    markeredgecolor='none',
)

ax.set_xlabel('Date', fontsize=14, fontweight='bold')
ax.set_ylabel('Ride Count', fontsize=14, fontweight='bold')
ax.set_title('Ride Count Over Time (Daily)', fontsize=16, fontweight='bold')

# Major ticks per month formatted like "Jan 2022"; minor ticks from a 5-day locator
date_axis = ax.xaxis
date_axis.set_major_locator(mdates.MonthLocator())
date_axis.set_major_formatter(mdates.DateFormatter('%b %Y'))
date_axis.set_minor_locator(mdates.DayLocator(interval=5))

plt.xticks(rotation=45, ha='right')

ax.grid(True, linestyle='--', alpha=0.7)
ax.legend(['Ride Count'], loc='upper left')

# Hide the top and right borders of the plot area
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.show()
In [14]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Make sure the start timestamps are datetimes before grouping by week
cycle_data['started_at'] = pd.to_datetime(cycle_data['started_at'])

# Number of rides per weekly bin of 'started_at'
ride_count_by_week = cycle_data.groupby(pd.Grouper(key='started_at', freq='W')).size()

# Line chart of the weekly totals, built through the Axes object
fig, ax = plt.subplots(figsize=(12, 6))
line_style = dict(
    color='#4CAF50',
    linewidth=2,
    marker='o',
    markersize=6,
    markerfacecolor='#FF5722',
    markeredgecolor='none',
)
ax.plot(ride_count_by_week.index, ride_count_by_week.values, **line_style)

ax.set_xlabel('Week', fontsize=14, fontweight='bold')
ax.set_ylabel('Ride Count', fontsize=14, fontweight='bold')
ax.set_title('Ride Count Over Time (Weekly)', fontsize=16, fontweight='bold')

# Major ticks per month formatted like "Jan 2022"; minor ticks from a weekday locator
date_axis = ax.xaxis
date_axis.set_major_locator(mdates.MonthLocator())
date_axis.set_major_formatter(mdates.DateFormatter('%b %Y'))
date_axis.set_minor_locator(mdates.WeekdayLocator())

plt.xticks(rotation=45, ha='right')

ax.grid(True, linestyle='--', alpha=0.7)
ax.legend(['Ride Count'], loc='upper left')

# Hide the top and right borders of the plot area
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.show()
In [15]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Make sure the start timestamps are datetimes before grouping by month
cycle_data['started_at'] = pd.to_datetime(cycle_data['started_at'])

# Number of rides per monthly bin of 'started_at'
ride_count_by_month = cycle_data.groupby(pd.Grouper(key='started_at', freq='M')).size()

# Line chart of the monthly totals, built through the Axes object
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    ride_count_by_month.index,
    ride_count_by_month.values,
    color='#4CAF50',
    linewidth=2,
    marker='o',
    markersize=6,
    markerfacecolor='#FF5722',
    markeredgecolor='none',
)

ax.set_xlabel('Month', fontsize=14, fontweight='bold')
ax.set_ylabel('Ride Count', fontsize=14, fontweight='bold')
ax.set_title('Monthly Ride Count', fontsize=16, fontweight='bold')

plt.xticks(rotation=45, ha='right')

ax.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()
In [16]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

def _plot_ride_counts(ax, counts, xlabel, title, minor_locator=None, decorate=True):
    """Draw one ride-count time series on *ax* with the styling shared by all panels.

    counts is a Series of ride counts indexed by date. minor_locator, when
    given, is installed as the x-axis minor tick locator. decorate adds the
    'Ride Count' legend and hides the top and right spines.
    """
    ax.plot(counts.index, counts.values, color='#4CAF50', linewidth=2, marker='o',
            markersize=6, markerfacecolor='#FF5722', markeredgecolor='none')
    ax.set_xlabel(xlabel, fontsize=14, fontweight='bold')
    ax.set_ylabel('Ride Count', fontsize=14, fontweight='bold')
    ax.set_title(title, fontsize=16, fontweight='bold')
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
    if minor_locator is not None:
        ax.xaxis.set_minor_locator(minor_locator)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(True, linestyle='--', alpha=0.7)
    if decorate:
        ax.legend(['Ride Count'], loc='upper left')
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)


# Make sure the start timestamps are datetimes before grouping
cycle_data['started_at'] = pd.to_datetime(cycle_data['started_at'])

# Ride counts at daily, weekly and monthly resolution
ride_count_by_day = cycle_data.groupby(pd.Grouper(key='started_at', freq='D')).size()
ride_count_by_week = cycle_data.groupby(pd.Grouper(key='started_at', freq='W')).size()
ride_count_by_month = cycle_data.groupby(pd.Grouper(key='started_at', freq='M')).size()

# Layout on a 3x2 grid: the daily panel spans the first row, the weekly and
# monthly panels share the second row.
# NOTE(review): the third grid row is left empty, as in the original layout.
plt.figure(figsize=(12, 12))

_plot_ride_counts(plt.subplot(3, 2, (1, 2)), ride_count_by_day, 'Date',
                  'Ride Count Over Time (Daily)',
                  minor_locator=mdates.DayLocator(interval=5))

_plot_ride_counts(plt.subplot(3, 2, 3), ride_count_by_week, 'Week',
                  'Ride Count Over Time (Weekly)',
                  minor_locator=mdates.WeekdayLocator())

# The monthly panel has no minor ticks, no legend and keeps all spines
_plot_ride_counts(plt.subplot(3, 2, 4), ride_count_by_month, 'Month',
                  'Monthly Ride Count', decorate=False)

# A single tight_layout() once every panel exists is sufficient
plt.tight_layout()

plt.show()
In [17]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

#Let's try fraunces font, to match the visualization to our blog page...
# NOTE: matplotlib falls back to its default font when 'Fraunces' is not installed.
fraunces_font = fm.FontProperties(family='Fraunces')

# Hour of the day in which each ride started
cycle_data['hour'] = cycle_data['started_at'].dt.hour

# Rides per starting hour, indexed by hour
ride_count_by_hour = cycle_data.groupby('hour').size()

mean_ride_count = ride_count_by_hour.mean()
max_ride_count = ride_count_by_hour.max()

plt.figure(figsize=(12, 8))
ax = ride_count_by_hour.plot(kind='bar', color='#4F6272', edgecolor='#1C3144', linewidth=1.5, alpha=0.8)

plt.xlabel('Hour of the Day', fontsize=16, fontweight='bold', fontproperties=fraunces_font)
plt.ylabel('Number of Rides', fontsize=16, fontweight='bold', fontproperties=fraunces_font)
plt.title('Peak Ride Hours', fontsize=18, fontweight='bold', fontproperties=fraunces_font)

# Dashed reference line at the mean, plus text annotations for mean and max
mean_line = plt.axhline(y=mean_ride_count, color='red', linestyle='--', linewidth=1.5)
plt.text(0, mean_ride_count + 10, f'Mean: {mean_ride_count:.0f} rides', color='red', fontsize=12, fontweight='bold', fontproperties=fraunces_font)
plt.text(0, max_ride_count + 10, f'Max: {max_ride_count:.0f} rides', color='blue', fontsize=12, fontweight='bold', fontproperties=fraunces_font)

plt.xticks(rotation=0, fontsize=14, fontproperties=fraunces_font)
plt.yticks(fontsize=14, fontproperties=fraunces_font)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Pass the handles explicitly so each label is bound to the intended artist;
# a labels-only legend() call pairs labels with whatever artists matplotlib
# finds first. fontsize is omitted because legend() ignores it when prop is given.
plt.legend([ax.containers[0], mean_line], ['Ride Count', 'Mean Ride Count'],
           loc='upper right', prop=fraunces_font)

plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.tight_layout()

plt.show()
findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans.
In [18]:
# Notebook shell/magic line (not Python): installs the geopy package.
# NOTE(review): geopy is not imported in the cells visible here; confirm it is still needed.
pip install geopy
Requirement already satisfied: geopy in c:\users\jatin gagwani\anaconda3\lib\site-packages (2.4.1)
Requirement already satisfied: geographiclib<3,>=1.52 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from geopy) (2.0)
Note: you may need to restart the kernel to use updated packages.
In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Hour of day and weekday name of every ride start
cycle_data['start_hour'] = cycle_data['started_at'].dt.hour
cycle_data['day_of_week'] = cycle_data['started_at'].dt.day_name()

# Rides per (start hour, weekday) cell
pivot_table = cycle_data.pivot_table(index='start_hour', columns='day_of_week', aggfunc='size')

# pivot_table sorts the weekday columns alphabetically (Friday, Monday, ...);
# put them back in calendar order so the heatmap reads Monday to Sunday.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
pivot_table = pivot_table.reindex(columns=weekday_order)

plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, cmap='coolwarm', linewidths=0.5)
plt.title('Ride Start Time Distribution by Hour and Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Start Hour')
plt.show()
In [20]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

# Weekdays as an ordered categorical so the bars appear Monday to Sunday
# instead of in the order the day names first occur in the data. A small
# two-column copy is used so cycle_data itself is left unchanged.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_dtype = pd.CategoricalDtype(categories=weekday_order, ordered=True)
plot_frame = cycle_data[['day_of_week', 'member_casual']].copy()
plot_frame['day_of_week'] = plot_frame['day_of_week'].astype(weekday_dtype)

plt.figure(figsize=(12, 8))  # Increase figure size for better visualization
ax = sns.histplot(data=plot_frame, x='day_of_week', hue='member_casual', multiple='stack', palette='coolwarm')
plt.title('Ride Frequency by Day of Week', fontsize=20, fontweight='bold', color='navy')
plt.xlabel('Day of Week', fontsize=16, fontweight='bold', color='darkslategray')
plt.ylabel('Frequency', fontsize=16, fontweight='bold', color='darkslategray')

# Keep the legend seaborn builds from the hue mapping and only restyle it.
# Replacing it via plt.legend(labels=[...]) re-pairs the labels with the bar
# artists by drawing order, which can attach 'Casual' to the member bars.
legend = ax.get_legend()
if legend is not None:
    legend.set_title('User Type', prop={'size': 14})
    for entry in legend.get_texts():
        entry.set_text(entry.get_text().capitalize())
        entry.set_fontsize(14)

plt.xticks(rotation=45, fontsize=14, color='dimgray')
plt.yticks(fontsize=14, color='dimgray')
plt.tight_layout()
plt.show()
In [21]:
import pandas as pd
# Weekday name on which each ride started, stored as a new column
weekday_names = cycle_data['started_at'].dt.day_name()
cycle_data['day_of_week'] = weekday_names
In [22]:
import pandas as pd
import matplotlib.pyplot as plt

# Month number in which each ride started, stored as a new column
start_month = cycle_data['started_at'].dt.month
cycle_data['month'] = start_month

def get_season(month):
    """Return the season name for a month number.

    Months 3-5 map to 'Spring', 6-8 to 'Summer' and 9-11 to 'Autumn';
    every other value (including 12, 1 and 2) maps to 'Winter'.
    """
    season_months = (
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Autumn', (9, 10, 11)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Anything not listed above falls through to winter
    return 'Winter'

# Season label for every ride, derived from its month number
cycle_data['season'] = cycle_data['month'].apply(get_season)

# Rides per season, listed in calendar order rather than the alphabetical
# order groupby() produces; a season without rides is reported as 0.
season_order = ['Winter', 'Spring', 'Summer', 'Autumn']
ride_count_by_season = (
    cycle_data.groupby('season').size()
    .reindex(season_order, fill_value=0)
    .rename_axis('season')
    .reset_index(name='ride_count')
)

# One bar colour per season, in the order of season_order
colors = ['#FFB6C1', '#ADD8E6', '#90EE90', '#FFD700']

plt.figure(figsize=(10, 6))
bars = plt.bar(ride_count_by_season['season'], ride_count_by_season['ride_count'], color=colors)

# Write each bar's ride count just above it
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval + 10, round(yval, 2), va='bottom', ha='center', fontsize=12)

plt.title('Ride Count by Season', fontsize=18, fontweight='bold', color='#333333')
plt.xlabel('Season', fontsize=14, fontweight='bold', color='#666666')
plt.ylabel('Ride Count', fontsize=14, fontweight='bold', color='#666666')
plt.xticks(fontsize=12, color='#666666')
plt.yticks(fontsize=12, color='#666666')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# No legend: all bars belong to one bar container, so a labels-only legend
# would show a single entry carrying just the first season's name. The
# x-axis tick labels already identify every bar.

plt.tight_layout()
plt.show()
In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np

# Month number of each ride's start timestamp, stored as a new column
started = cycle_data['started_at']
cycle_data['month'] = started.dt.month

def get_season(month):
    """Translate a month number into a season name.

    Returns 'Spring' for 3-5, 'Summer' for 6-8, 'Autumn' for 9-11 and
    'Winter' for any other value.
    """
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Autumn'
    return 'Winter'

# Season label for every ride, derived from its month number
cycle_data['season'] = cycle_data['month'].apply(get_season)

# Rides per season as a two-column frame: 'season', 'ride_count'
season_sizes = cycle_data.groupby('season').size()
ride_count_by_season = season_sizes.reset_index(name='ride_count')

# Build a colormap through the four anchor colours and sample it once per colour
colors = ['#FF6347', '#FFD700', '#00FF00', '#1E90FF']
cmap = mcolors.LinearSegmentedColormap.from_list("mycmap", colors)
wedge_colors = cmap(np.linspace(0, 1, len(colors)))

# Ring-shaped pie (wedge width 0.5) with percentage labels on the wedges
plt.figure(figsize=(10, 8))
wedges, texts, autotexts = plt.pie(
    ride_count_by_season['ride_count'],
    labels=ride_count_by_season['season'],
    autopct='%1.1f%%',
    startangle=140,
    colors=wedge_colors,
    wedgeprops=dict(width=0.5, edgecolor='w'),
    shadow=True,
)

# Percentage labels: bold, white, size 12
plt.setp(autotexts, size=12, weight="bold", color="white")

plt.title('Distribution of Rides by Season', fontsize=20, fontweight='bold', color='#333333', pad=20)

# Equal aspect ratio for the axes
plt.axis('equal')

plt.tight_layout()
plt.show()
In [24]:
import matplotlib.pyplot as plt

# Rides per (bike type, rider type): one row per rideable_type, one column
# per member_casual value, 0 where a combination has no rides
ride_count_by_type = cycle_data.groupby(['rideable_type', 'member_casual']).size().unstack(fill_value=0)

num_rideable_types = len(ride_count_by_type)

num_rows = 1
num_cols = num_rideable_types

# Wedge colours are defined here instead of being read from a global
# `colors` list left behind by an earlier cell, so this cell gives the same
# result regardless of which cells ran before it.
pie_colors = ['#FF6347', '#FFD700']

# squeeze=False always yields a 2-D array of axes, even for a single pie
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 6), squeeze=False)

# One ring-shaped pie per bike type, split by rider type
for ax, (rideable_type, counts) in zip(axes[0], ride_count_by_type.iterrows()):
    ax.pie(counts, labels=ride_count_by_type.columns, autopct='%1.1f%%', startangle=90, colors=pie_colors,
           wedgeprops=dict(width=0.3), textprops=dict(fontsize=12))
    ax.set_title(f'Distribution of Rides by User Type ({rideable_type})', fontsize=14, pad=20)

plt.tight_layout()
plt.show()
In [25]:
import matplotlib.pyplot as plt

# Fixed colour per rider type. Colours are looked up by name rather than by
# position because value_counts() and groupby() return the rider types in
# different orders, so a positional colour list would paint the same rider
# type differently in the two charts.
user_type_colors = {'member': 'blue', 'casual': 'orange'}

# Ride totals and mean ride length (minutes) per rider type
ride_count_by_user_type = cycle_data['member_casual'].value_counts()
avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: number of rides per rider type
axes[0].bar(ride_count_by_user_type.index, ride_count_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_user_type.index])
axes[0].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Ride Count', fontsize=12, fontweight='bold')
axes[0].set_title('Ride Frequency by User Type', fontsize=14, fontweight='bold')

# Value label above each bar
for i, count in enumerate(ride_count_by_user_type.values):
    axes[0].text(i, count + 50, str(count), ha='center', va='bottom', fontsize=10)

# Right panel: mean ride duration per rider type
axes[1].bar(avg_duration_by_user_type.index, avg_duration_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in avg_duration_by_user_type.index])
axes[1].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Average Ride Duration (minutes)', fontsize=12, fontweight='bold')
axes[1].set_title('Average Ride Duration by User Type', fontsize=14, fontweight='bold')

# Value label above each bar
for i, duration in enumerate(avg_duration_by_user_type.values):
    axes[1].text(i, duration + 5, f'{duration:.2f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()
In [26]:
import pandas as pd
import matplotlib.pyplot as plt

# Fixed colour per rider type, shared by every chart in this cell. Colours
# are looked up by name rather than by position because value_counts() and
# groupby() return the rider types in different orders, so positional colour
# lists would paint the same rider type differently from chart to chart.
user_type_colors = {'casual': 'orange', 'member': 'royalblue'}

# --- Figure 1: daily ride counts per rider type ---
started_date = cycle_data['started_at'].dt.date
ride_count_by_day_casual = cycle_data[cycle_data['member_casual'] == 'casual'].groupby(started_date).size()
ride_count_by_day_member = cycle_data[cycle_data['member_casual'] == 'member'].groupby(started_date).size()

plt.figure(figsize=(10, 6))
plt.plot(ride_count_by_day_casual.index, ride_count_by_day_casual.values, label='Casual', color=user_type_colors['casual'])
plt.plot(ride_count_by_day_member.index, ride_count_by_day_member.values, label='Member', color=user_type_colors['member'])
plt.title("Ride Frequency Over Time")
plt.xlabel("Month")
plt.ylabel("Ride Count")
plt.legend()
plt.grid(True)
plt.show()

# --- Figure 2: ride count and mean ride duration per rider type ---
ride_count_by_user_type = cycle_data['member_casual'].value_counts()
avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

axes[0].bar(ride_count_by_user_type.index, ride_count_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_user_type.index])
axes[0].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Ride Count', fontsize=12, fontweight='bold')
axes[0].set_title('Ride Frequency by User Type', fontsize=14, fontweight='bold')

# Value label above each bar
for i, count in enumerate(ride_count_by_user_type.values):
    axes[0].text(i, count + 50, str(count), ha='center', va='bottom', fontsize=10)

axes[1].bar(avg_duration_by_user_type.index, avg_duration_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in avg_duration_by_user_type.index])
axes[1].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Average Ride Duration (minutes)', fontsize=12, fontweight='bold')
axes[1].set_title('Average Ride Duration by User Type', fontsize=14, fontweight='bold')

# Value label above each bar
for i, duration in enumerate(avg_duration_by_user_type.values):
    axes[1].text(i, duration + 5, f'{duration:.2f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

# --- Figure 3: rider-type split for each bike type ---
ride_count_by_type = cycle_data.groupby(['rideable_type', 'member_casual']).size().unstack(fill_value=0)

num_rideable_types = len(ride_count_by_type)

num_rows = 1
num_cols = num_rideable_types

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 6))

# Wedge colours follow the column (rider type) order of the table
pie_colors = [user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_type.columns]

for i, (rideable_type, counts) in enumerate(ride_count_by_type.iterrows()):
    # plt.subplots returns a bare Axes (not an array) when there is one column
    ax = axes[i] if num_rideable_types > 1 else axes
    ax.pie(counts, labels=ride_count_by_type.columns, autopct='%1.1f%%', startangle=90,
           wedgeprops=dict(width=0.3), textprops=dict(fontsize=12), colors=pie_colors)
    ax.set_title(f'Distribution of Rides by User Type ({rideable_type})', fontsize=14, pad=20)

plt.tight_layout()

plt.show()
In [27]:
import pandas as pd
import matplotlib.pyplot as plt

colors = ['#1f77b4', '#ff7f0e']

# Fixed colour per rider type, shared by every chart in this cell. Colours
# are looked up by name rather than by position because value_counts() and
# groupby() return the rider types in different orders, so a positional
# colour list would paint the same rider type differently between charts.
user_type_colors = {'casual': colors[0], 'member': colors[1]}

# --- Figure 1: daily ride counts per rider type ---
started_date = cycle_data['started_at'].dt.date
ride_count_by_day_casual = cycle_data[cycle_data['member_casual'] == 'casual'].groupby(started_date).size()
ride_count_by_day_member = cycle_data[cycle_data['member_casual'] == 'member'].groupby(started_date).size()

plt.figure(figsize=(10, 6))
plt.plot(ride_count_by_day_casual.index, ride_count_by_day_casual.values, label='Casual', color=user_type_colors['casual'])
plt.plot(ride_count_by_day_member.index, ride_count_by_day_member.values, label='Member', color=user_type_colors['member'])
plt.title("Ride Frequency Over Time")
plt.xlabel("Month")
plt.ylabel("Ride Count")
plt.legend()
plt.grid(True)
plt.show()

# --- Figure 2: ride count and mean ride duration per rider type ---
ride_count_by_user_type = cycle_data['member_casual'].value_counts()
avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

axes[0].bar(ride_count_by_user_type.index, ride_count_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_user_type.index])
axes[0].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Ride Count', fontsize=12, fontweight='bold')
axes[0].set_title('Ride Frequency by User Type', fontsize=14, fontweight='bold')

# Value label above each bar
for i, count in enumerate(ride_count_by_user_type.values):
    axes[0].text(i, count + 50, str(count), ha='center', va='bottom', fontsize=10)

axes[1].bar(avg_duration_by_user_type.index, avg_duration_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in avg_duration_by_user_type.index])
axes[1].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Average Ride Duration (minutes)', fontsize=12, fontweight='bold')
axes[1].set_title('Average Ride Duration by User Type', fontsize=14, fontweight='bold')

# Value label above each bar
for i, duration in enumerate(avg_duration_by_user_type.values):
    axes[1].text(i, duration + 5, f'{duration:.2f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

# --- Figure 3: rider-type split for each bike type ---
ride_count_by_type = cycle_data.groupby(['rideable_type', 'member_casual']).size().unstack(fill_value=0)

num_rideable_types = len(ride_count_by_type)

num_rows = 1
num_cols = num_rideable_types

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 6))

# Wedge colours follow the column (rider type) order of the table
pie_colors = [user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_type.columns]

for i, (rideable_type, counts) in enumerate(ride_count_by_type.iterrows()):
    # plt.subplots returns a bare Axes (not an array) when there is one column
    ax = axes[i] if num_rideable_types > 1 else axes
    ax.pie(counts, labels=ride_count_by_type.columns, autopct='%1.1f%%', startangle=90,
           wedgeprops=dict(width=0.3), textprops=dict(fontsize=12), colors=pie_colors)
    ax.set_title(f'Distribution of Rides by User Type ({rideable_type})', fontsize=14, pad=20)

plt.tight_layout()

plt.show()
In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

colors = ['#1f77b4', '#ff7f0e']

# Fixed colour per rider type, shared by the line, bar and pie charts below.
# Colours are looked up by name rather than by position because
# value_counts() and groupby() return the rider types in different orders,
# so a positional colour list would paint the same rider type differently
# between charts.
user_type_colors = {'casual': colors[0], 'member': colors[1]}

# --- Figure 1: daily ride counts per rider type ---
started_date = cycle_data['started_at'].dt.date
ride_count_by_day_casual = cycle_data[cycle_data['member_casual'] == 'casual'].groupby(started_date).size()
ride_count_by_day_member = cycle_data[cycle_data['member_casual'] == 'member'].groupby(started_date).size()

plt.figure(figsize=(10, 6))
plt.plot(ride_count_by_day_casual.index, ride_count_by_day_casual.values, label='Casual', color=user_type_colors['casual'])
plt.plot(ride_count_by_day_member.index, ride_count_by_day_member.values, label='Member', color=user_type_colors['member'])
plt.title("Ride Frequency Over Time")
plt.xlabel("Month")
plt.ylabel("Ride Count")
plt.legend()
plt.grid(True)
plt.show()

# --- Figure 2: ride count and mean ride duration per rider type ---
ride_count_by_user_type = cycle_data['member_casual'].value_counts()
avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

axes[0].bar(ride_count_by_user_type.index, ride_count_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_user_type.index])
axes[0].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Ride Count', fontsize=12, fontweight='bold')
axes[0].set_title('Ride Frequency by User Type', fontsize=14, fontweight='bold')

# Value label above each bar
for i, count in enumerate(ride_count_by_user_type.values):
    axes[0].text(i, count + 50, str(count), ha='center', va='bottom', fontsize=10)

axes[1].bar(avg_duration_by_user_type.index, avg_duration_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in avg_duration_by_user_type.index])
axes[1].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Average Ride Duration (minutes)', fontsize=12, fontweight='bold')
axes[1].set_title('Average Ride Duration by User Type', fontsize=14, fontweight='bold')

# Value label above each bar
for i, duration in enumerate(avg_duration_by_user_type.values):
    axes[1].text(i, duration + 5, f'{duration:.2f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

# --- Figure 3: rider-type split for each bike type ---
ride_count_by_type = cycle_data.groupby(['rideable_type', 'member_casual']).size().unstack(fill_value=0)

num_rideable_types = len(ride_count_by_type)

num_rows = 1
num_cols = num_rideable_types

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 6))

# Wedge colours follow the column (rider type) order of the table
pie_colors = [user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_type.columns]

for i, (rideable_type, counts) in enumerate(ride_count_by_type.iterrows()):
    # plt.subplots returns a bare Axes (not an array) when there is one column
    ax = axes[i] if num_rideable_types > 1 else axes
    ax.pie(counts, labels=ride_count_by_type.columns, autopct='%1.1f%%', startangle=90,
           wedgeprops=dict(width=0.3), textprops=dict(fontsize=12), colors=pie_colors)
    ax.set_title(f'Distribution of Rides by User Type ({rideable_type})', fontsize=14, pad=20)

plt.tight_layout()

# --- Figure 4: stacked ride counts per weekday ---
# Weekdays as an ordered categorical so the bars appear Monday to Sunday
# instead of in the order the day names first occur in the data. A small
# two-column copy is used so cycle_data itself is left unchanged.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_dtype = pd.CategoricalDtype(categories=weekday_order, ordered=True)
weekday_frame = cycle_data[['day_of_week', 'member_casual']].copy()
weekday_frame['day_of_week'] = weekday_frame['day_of_week'].astype(weekday_dtype)

plt.figure(figsize=(12, 8))
hist_ax = sns.histplot(data=weekday_frame, x='day_of_week', hue='member_casual', multiple='stack', palette='coolwarm')
plt.title('Ride Frequency by Day of Week', fontsize=20, fontweight='bold', color='navy')
plt.xlabel('Day of Week', fontsize=16, fontweight='bold', color='darkslategray')
plt.ylabel('Frequency', fontsize=16, fontweight='bold', color='darkslategray')

# Keep the legend seaborn builds from the hue mapping and only restyle it.
# Replacing it via plt.legend(labels=[...]) re-pairs the labels with the bar
# artists by drawing order, which can attach 'Casual' to the member bars.
hist_legend = hist_ax.get_legend()
if hist_legend is not None:
    hist_legend.set_title('User Type', prop={'size': 14})
    for entry in hist_legend.get_texts():
        entry.set_text(entry.get_text().capitalize())
        entry.set_fontsize(14)

plt.xticks(rotation=45, fontsize=14, color='dimgray')
plt.yticks(fontsize=14, color='dimgray')
plt.tight_layout()
plt.show()
In [29]:
# Notebook shell/magic line (not Python): installs the dash package.
# NOTE(review): dash is not imported in the cells visible here; confirm it is still needed.
pip install dash
Requirement already satisfied: dash in c:\users\jatin gagwani\anaconda3\lib\site-packages (2.16.1)Note: you may need to restart the kernel to use updated packages.

Requirement already satisfied: Flask<3.1,>=1.0.4 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (1.1.2)
Requirement already satisfied: plotly>=5.0.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (5.9.0)
Requirement already satisfied: retrying in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (1.3.4)
Requirement already satisfied: typing-extensions>=4.1.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (4.3.0)
Requirement already satisfied: setuptools in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (63.4.1)
Requirement already satisfied: dash-table==5.0.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (5.0.0)
Requirement already satisfied: Werkzeug<3.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (2.0.3)
Requirement already satisfied: importlib-metadata in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (4.11.3)
Requirement already satisfied: dash-core-components==2.0.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (2.0.0)
Requirement already satisfied: nest-asyncio in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (1.5.5)
Requirement already satisfied: requests in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (2.28.1)
Requirement already satisfied: dash-html-components==2.0.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (2.0.0)
Requirement already satisfied: itsdangerous>=0.24 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (2.0.1)
Requirement already satisfied: click>=5.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (8.0.4)
Requirement already satisfied: Jinja2>=2.10.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (3.1.3)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from plotly>=5.0.0->dash) (8.0.1)
Requirement already satisfied: zipp>=0.5 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from importlib-metadata->dash) (3.8.0)
Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from requests->dash) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from requests->dash) (3.3)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from requests->dash) (2024.2.2)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from requests->dash) (1.26.11)
Requirement already satisfied: six>=1.7.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from retrying->dash) (1.16.0)
Requirement already satisfied: colorama in c:\users\jatin gagwani\anaconda3\lib\site-packages (from click>=5.1->Flask<3.1,>=1.0.4->dash) (0.4.5)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from Jinja2>=2.10.1->Flask<3.1,>=1.0.4->dash) (2.0.1)
In [30]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Titles for the six dashboard panels, listed row by row (left to right,
# top to bottom) to match the 3x2 grid the traces below are placed into.
panel_titles = (
    "Ride Frequency by User Type",
    "Average Ride Duration by User Type",
    "Popular Starting Locations",
    "Popular Ending Locations",
    "Ride Duration Distribution by User Type",
    "Ride Frequency Over Time",
)

fig = make_subplots(rows=3, cols=2, subplot_titles=panel_titles)

# One fixed colour per rider type, matching the duration histogram further
# down in this cell (Member = blue, Casual = orange). Colouring by position is
# unreliable: value_counts() orders its result by frequency while groupby()
# orders by key, so a positional ['blue', 'orange'] list can paint the same
# rider type in different colours in the two panels below.
user_type_colors = {'member': 'blue', 'casual': 'orange'}

# Panel (1, 1): number of rides per rider type.
ride_count_by_user_type = cycle_data['member_casual'].value_counts()

fig.add_trace(
    go.Bar(
        x=ride_count_by_user_type.index,
        y=ride_count_by_user_type.values,
        # Look the colour up from the rider type; unexpected labels fall back
        # to gray instead of silently borrowing another type's colour.
        marker_color=[user_type_colors.get(user_type, 'gray')
                      for user_type in ride_count_by_user_type.index],
        # The x-axis already labels each bar; an unnamed trace would only add
        # a meaningless "trace N" entry to the figure legend.
        showlegend=False,
    ),
    row=1, col=1,
)
fig.update_xaxes(title_text="User Type", row=1, col=1)
fig.update_yaxes(title_text="Ride Count", row=1, col=1)

# Panel (1, 2): mean ride duration per rider type.
avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()

fig.add_trace(
    go.Bar(
        x=avg_duration_by_user_type.index,
        y=avg_duration_by_user_type.values,
        marker_color=[user_type_colors.get(user_type, 'gray')
                      for user_type in avg_duration_by_user_type.index],
        showlegend=False,
    ),
    row=1, col=2,
)
fig.update_xaxes(title_text="User Type", row=1, col=2)
fig.update_yaxes(title_text="Average Ride Duration (minutes)", row=1, col=2)

# Panel (2, 1): the five start stations with the most rides.
starting_locations = cycle_data['start_station_name'].value_counts().nlargest(5)
fig.add_trace(
    go.Bar(
        x=starting_locations.index,
        y=starting_locations.values,
        marker_color='green',
        # Single unnamed trace: keep it out of the legend, where it would
        # otherwise show up as an uninformative "trace N" entry.
        showlegend=False,
    ),
    row=2, col=1,
)
fig.update_xaxes(title_text="Start Station", row=2, col=1)
fig.update_yaxes(title_text="Frequency", row=2, col=1)

# Panel (2, 2): the five end stations with the most rides.
ending_locations = cycle_data['end_station_name'].value_counts().nlargest(5)
fig.add_trace(
    go.Bar(
        x=ending_locations.index,
        y=ending_locations.values,
        marker_color='green',
        # Single unnamed trace: keep it out of the legend, where it would
        # otherwise show up as an uninformative "trace N" entry.
        showlegend=False,
    ),
    row=2, col=2,
)
fig.update_xaxes(title_text="End Station", row=2, col=2)
fig.update_yaxes(title_text="Frequency", row=2, col=2)

# Panel (3, 1): ride-duration distributions for members and casual riders,
# drawn on the same axes so the two shapes can be compared directly.
# .loc[mask, column] selects rows and column in one step instead of the
# chained df[mask][column] lookup.
is_member = cycle_data['member_casual'] == 'member'
is_casual = cycle_data['member_casual'] == 'casual'

# NOTE(review): go.Histogram is handed the raw per-ride values, so presumably
# all of them end up serialized in the exported figure; if the output becomes
# too large, consider pre-binning (e.g. numpy.histogram) and plotting bars.
fig.add_trace(
    go.Histogram(
        x=cycle_data.loc[is_member, 'ride_duration_minutes'],
        name='Member', marker_color='blue', opacity=0.7,
    ),
    row=3, col=1,
)
fig.add_trace(
    go.Histogram(
        x=cycle_data.loc[is_casual, 'ride_duration_minutes'],
        name='Casual', marker_color='orange', opacity=0.7,
    ),
    row=3, col=1,
)
fig.update_xaxes(title_text="Ride Duration (minutes)", row=3, col=1)
fig.update_yaxes(title_text="Frequency", row=3, col=1)

# The traces are semi-transparent so they can sit on top of each other, but
# that only happens when barmode is explicitly 'overlay'. barmode is a
# figure-wide setting; every other bar panel in this cell holds a single
# trace, so only the histogram panel is affected.
fig.update_layout(barmode='overlay')

# Panel (3, 2): number of rides started on each calendar day.
# Relies on 'started_at' already being a datetime column (the .dt accessor
# raises otherwise) — presumably converted earlier in the notebook.
ride_count_by_day = cycle_data.groupby(cycle_data['started_at'].dt.date).size()

fig.add_trace(
    go.Scatter(
        x=ride_count_by_day.index,
        y=ride_count_by_day.values,
        mode='lines',
        marker_color='purple',
        # Single unnamed trace: keep it out of the legend, where it would
        # otherwise show up as an uninformative "trace N" entry.
        showlegend=False,
    ),
    row=3, col=2,
)
fig.update_xaxes(title_text="Date", row=3, col=2)
fig.update_yaxes(title_text="Ride Count", row=3, col=2)

# Give the whole figure a title.
dashboard_title = "Combined Dashboard"
fig.update_layout(title_text=dashboard_title)

# Export the dashboard as an HTML file; the path is relative, so it is
# written to the current working directory.
dashboard_path = "combined_dashboard.html"
fig.write_html(dashboard_path)