This repository unlocks the source code for the Cyclist Bike Share data analysis project. Get a behind-the-scenes look at how we analyzed cyclist usage patterns!
Ready to Shift Gears?
Source Code: Dive into the heart of the project by examining the code within this repository. It's your roadmap to understanding how we analyzed cyclist data.
Detailed Documentation: To fully grasp the project's goals, methodology, and results, explore our comprehensive documentation.
Blog Post: Seeking a concise overview of the project's key findings? Visit our blog post for a streamlined explanation.
Visit Our Site: Curious about other data-driven projects? Head over to our website for more!
Got Questions? Feel free to reach out!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import pandas as pd
# Use glob to find all CSV files in the directory.
# sorted() makes the load (and therefore row) order deterministic; glob returns
# matches in arbitrary, OS-dependent order.
csv_files = sorted(glob.glob(r'C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\*.csv'))
# Fail early with a readable message: with no matches, pd.concat would raise an
# opaque "No objects to concatenate" ValueError instead.
if not csv_files:
    raise FileNotFoundError("No CSV files matched the trip-data pattern; check the directory path.")
# Print the list of CSV files found
print(*csv_files, sep="\n")
# Concatenate all CSV files into a single DataFrame with a fresh 0..n-1 index
cycle_data = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202201-divvy-tripdata.csv.csv C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202202-divvy-tripdata.csv.csv C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202203-divvy-tripdata.csv.csv C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202204-divvy-tripdata.csv C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202205-divvy-tripdata.csv C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202206-divvy-tripdata.csv C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202207-divvy-tripdata.csv C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202208-divvy-tripdata.csv C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202209-divvy-publictripdata.csv C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202210-divvy-tripdata.csv C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202211-divvy-tripdata.csv C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202212-divvy-tripdata.csv
# Preview the first rows of the combined DataFrame. As a bare expression this is
# only rendered when it is the last statement of a notebook cell.
cycle_data.head()
ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C2F7DD78E82EC875 | electric_bike | 2022-01-13 11:59:47 | 2022-01-13 12:02:44 | Glenwood Ave & Touhy Ave | 525 | Clark St & Touhy Ave | RP-007 | 42.012800 | -87.665906 | 42.012560 | -87.674367 | casual |
1 | A6CF8980A652D272 | electric_bike | 2022-01-10 08:41:56 | 2022-01-10 08:46:17 | Glenwood Ave & Touhy Ave | 525 | Clark St & Touhy Ave | RP-007 | 42.012763 | -87.665967 | 42.012560 | -87.674367 | casual |
2 | BD0F91DFF741C66D | classic_bike | 2022-01-25 04:53:40 | 2022-01-25 04:58:01 | Sheffield Ave & Fullerton Ave | TA1306000016 | Greenview Ave & Fullerton Ave | TA1307000001 | 41.925602 | -87.653708 | 41.925330 | -87.665800 | member |
3 | CBB80ED419105406 | classic_bike | 2022-01-04 00:18:04 | 2022-01-04 00:33:00 | Clark St & Bryn Mawr Ave | KA1504000151 | Paulina St & Montrose Ave | TA1309000021 | 41.983593 | -87.669154 | 41.961507 | -87.671387 | casual |
4 | DDC963BFDDA51EEA | classic_bike | 2022-01-20 01:31:10 | 2022-01-20 01:37:12 | Michigan Ave & Jackson Blvd | TA1309000002 | State St & Randolph St | TA1305000029 | 41.877850 | -87.624080 | 41.884621 | -87.627834 | member |
# Use head() to display the first few rows of the DataFrame
print(cycle_data.head())
# info() writes its report to stdout itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
cycle_data.info()
# Summary statistics for the numeric columns
print(cycle_data.describe())
# Check for missing values in each column
print(cycle_data.isnull().sum())
# Check data types of columns
print(cycle_data.dtypes)
ride_id rideable_type started_at ended_at \ 0 C2F7DD78E82EC875 electric_bike 2022-01-13 11:59:47 2022-01-13 12:02:44 1 A6CF8980A652D272 electric_bike 2022-01-10 08:41:56 2022-01-10 08:46:17 2 BD0F91DFF741C66D classic_bike 2022-01-25 04:53:40 2022-01-25 04:58:01 3 CBB80ED419105406 classic_bike 2022-01-04 00:18:04 2022-01-04 00:33:00 4 DDC963BFDDA51EEA classic_bike 2022-01-20 01:31:10 2022-01-20 01:37:12 start_station_name start_station_id \ 0 Glenwood Ave & Touhy Ave 525 1 Glenwood Ave & Touhy Ave 525 2 Sheffield Ave & Fullerton Ave TA1306000016 3 Clark St & Bryn Mawr Ave KA1504000151 4 Michigan Ave & Jackson Blvd TA1309000002 end_station_name end_station_id start_lat start_lng \ 0 Clark St & Touhy Ave RP-007 42.012800 -87.665906 1 Clark St & Touhy Ave RP-007 42.012763 -87.665967 2 Greenview Ave & Fullerton Ave TA1307000001 41.925602 -87.653708 3 Paulina St & Montrose Ave TA1309000021 41.983593 -87.669154 4 State St & Randolph St TA1305000029 41.877850 -87.624080 end_lat end_lng member_casual 0 42.012560 -87.674367 casual 1 42.012560 -87.674367 casual 2 41.925330 -87.665800 member 3 41.961507 -87.671387 casual 4 41.884621 -87.627834 member <class 'pandas.core.frame.DataFrame'> RangeIndex: 5667717 entries, 0 to 5667716 Data columns (total 13 columns): # Column Dtype --- ------ ----- 0 ride_id object 1 rideable_type object 2 started_at object 3 ended_at object 4 start_station_name object 5 start_station_id object 6 end_station_name object 7 end_station_id object 8 start_lat float64 9 start_lng float64 10 end_lat float64 11 end_lng float64 12 member_casual object dtypes: float64(4), object(9) memory usage: 562.1+ MB None start_lat start_lng end_lat end_lng count 5.667717e+06 5.667717e+06 5.661859e+06 5.661859e+06 mean 4.190222e+01 -8.764783e+01 4.190242e+01 -8.764790e+01 std 4.626109e-02 2.999925e-02 6.805821e-02 1.082985e-01 min 4.164000e+01 -8.784000e+01 0.000000e+00 -8.814000e+01 25% 4.188103e+01 -8.766154e+01 4.188103e+01 -8.766260e+01 50% 4.190000e+01 
-8.764410e+01 4.190000e+01 -8.764414e+01 75% 4.193000e+01 -8.762957e+01 4.193000e+01 -8.762963e+01 max 4.563503e+01 -7.379648e+01 4.237000e+01 0.000000e+00 ride_id 0 rideable_type 0 started_at 0 ended_at 0 start_station_name 833064 start_station_id 833064 end_station_name 892742 end_station_id 892742 start_lat 0 start_lng 0 end_lat 5858 end_lng 5858 member_casual 0 dtype: int64 ride_id object rideable_type object started_at object ended_at object start_station_name object start_station_id object end_station_name object end_station_id object start_lat float64 start_lng float64 end_lat float64 end_lng float64 member_casual object dtype: object
# Discard, in place, every row that has at least one missing value.
cycle_data.dropna(inplace=True)
# Normalise the column labels: spaces become underscores, then lower-case.
underscored_columns = cycle_data.columns.str.replace(' ', '_')
cycle_data.columns = underscored_columns.str.lower()
print(cycle_data.columns)
# Parse both trip timestamps into datetime values
# (the columns are expected to be named 'started_at' and 'ended_at').
for timestamp_column in ('started_at', 'ended_at'):
    cycle_data[timestamp_column] = pd.to_datetime(cycle_data[timestamp_column])
Index(['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual'], dtype='object')
# Trip length as a timedelta, computed once and reused below.
ride_duration = cycle_data['ended_at'] - cycle_data['started_at']
# total_seconds() converts the whole timedelta to seconds. The previous
# `.dt.seconds` returns only the seconds *component* (0-86399): it drops whole
# days from multi-day rides and turns a negative duration into a large positive one.
cycle_data['ride_duration_minutes'] = ride_duration.dt.total_seconds() / 60
# Perform exploratory data analysis (EDA)
average_ride_duration = ride_duration.mean()
print("Average ride duration:", average_ride_duration)
Average ride duration: 0 days 00:17:05.710490552
# Tally the rides per value of 'member_casual', most frequent first.
ride_count_by_user_type = cycle_data['member_casual'].value_counts()
print("Ride count by user type:", ride_count_by_user_type, sep="\n")
Ride count by user type: member 2611171 casual 1758189 Name: member_casual, dtype: int64
# Store the hour of day taken from each ride's 'started_at' timestamp.
ride_start_times = cycle_data['started_at']
cycle_data['start_hour'] = ride_start_times.dt.hour
# Rides per start hour, ordered from the busiest hour down.
ride_count_by_hour = cycle_data['start_hour'].value_counts()
print("Peak hours for rides:", ride_count_by_hour, sep="\n")
Peak hours for rides: 17 451135 16 384582 18 373986 15 307606 19 273653 14 263969 13 257552 12 255740 11 219678 8 219256 20 193556 7 181117 10 176532 9 167054 21 156609 22 127787 6 97554 23 87524 0 58535 1 36905 5 34501 2 21266 3 12443 4 10820 Name: start_hour, dtype: int64
import seaborn as sns
import matplotlib.pyplot as plt
# Create a box plot for ride duration by user type.
# Durations use total_seconds(): `.dt.seconds` is only the seconds component of
# the timedelta, so it mis-reports rides longer than a day and negative durations.
duration_minutes = (cycle_data['ended_at'] - cycle_data['started_at']).dt.total_seconds() / 60
plt.figure(figsize=(10, 6))
sns.boxplot(x='member_casual', y=duration_minutes, data=cycle_data, palette='pastel')
plt.xlabel('User Type')
plt.ylabel('Ride Duration (minutes)')
plt.title('Distribution of Ride Durations by User Type')
# Show only the 0-150 minute range; longer rides fall outside the visible axis.
plt.ylim(0, 150)
plt.show()
import folium
from folium.plugins import HeatMap
from IPython.display import display
# Define the latitude and longitude coordinates for the center of the map
latitude_center = 41.8781  # Example latitude
longitude_center = -87.6298  # Example longitude
# Create a map centred on Chicago, where the rides take place. The constants
# above are used here instead of repeating the same literals a second time.
m = folium.Map(location=[latitude_center, longitude_center], zoom_start=12)
# One [lat, lng] pair per ride for the start points and for the end points.
start_locations = cycle_data[['start_lat', 'start_lng']].values.tolist()
end_locations = cycle_data[['end_lat', 'end_lng']].values.tolist()
# Both heat-map layers are added to the same map.
HeatMap(start_locations).add_to(m)
HeatMap(end_locations).add_to(m)
# Save the map to an HTML file
m.save('start_end_heatmap.html')
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# Convert the start timestamps to datetime (harmless if already converted).
cycle_data['started_at'] = pd.to_datetime(cycle_data['started_at'])
# Count the rides that started in each daily bin.
daily_bins = pd.Grouper(key='started_at', freq='D')
ride_count_by_day = cycle_data.groupby(daily_bins).size()
# Create a line plot
fig_daily = plt.figure(figsize=(12, 6))
ax_daily = fig_daily.gca()
ax_daily.plot(
    ride_count_by_day.index,
    ride_count_by_day.values,
    color='#4CAF50',
    linewidth=2,
    marker='o',
    markersize=6,
    markerfacecolor='#FF5722',
    markeredgecolor='none',
)
ax_daily.set_xlabel('Date', fontsize=14, fontweight='bold')
ax_daily.set_ylabel('Ride Count', fontsize=14, fontweight='bold')
ax_daily.set_title('Ride Count Over Time (Daily)', fontsize=16, fontweight='bold')
# Major ticks: one per month, formatted like "Jan 2022"; minor ticks every 5 days.
ax_daily.xaxis.set_major_locator(mdates.MonthLocator())
ax_daily.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
ax_daily.xaxis.set_minor_locator(mdates.DayLocator(interval=5))
plt.xticks(rotation=45, ha='right')
ax_daily.grid(True, linestyle='--', alpha=0.7)
ax_daily.legend(['Ride Count'], loc='upper left')
# Hide the top and right frame lines.
for side in ('top', 'right'):
    ax_daily.spines[side].set_visible(False)
fig_daily.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# Convert the start timestamps to datetime (harmless if already converted).
cycle_data['started_at'] = pd.to_datetime(cycle_data['started_at'])
# Count the rides that started in each weekly bin.
weekly_bins = pd.Grouper(key='started_at', freq='W')
ride_count_by_week = cycle_data.groupby(weekly_bins).size()
fig_weekly = plt.figure(figsize=(12, 6))
ax_weekly = fig_weekly.gca()
ax_weekly.plot(
    ride_count_by_week.index,
    ride_count_by_week.values,
    color='#4CAF50',
    linewidth=2,
    marker='o',
    markersize=6,
    markerfacecolor='#FF5722',
    markeredgecolor='none',
)
ax_weekly.set_xlabel('Week', fontsize=14, fontweight='bold')
ax_weekly.set_ylabel('Ride Count', fontsize=14, fontweight='bold')
ax_weekly.set_title('Ride Count Over Time (Weekly)', fontsize=16, fontweight='bold')
# Major ticks: one per month, formatted like "Jan 2022"; minor ticks from WeekdayLocator.
ax_weekly.xaxis.set_major_locator(mdates.MonthLocator())
ax_weekly.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
ax_weekly.xaxis.set_minor_locator(mdates.WeekdayLocator())
plt.xticks(rotation=45, ha='right')
ax_weekly.grid(True, linestyle='--', alpha=0.7)
ax_weekly.legend(['Ride Count'], loc='upper left')
# Hide the top and right frame lines.
for side in ('top', 'right'):
    ax_weekly.spines[side].set_visible(False)
fig_weekly.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# Convert the start timestamps to datetime (harmless if already converted).
cycle_data['started_at'] = pd.to_datetime(cycle_data['started_at'])
# 'MS' labels each monthly bin with the first day of its month. The previous 'M'
# alias labels bins with the month's *last* day, which draws e.g. January's
# total right next to February on the axis, and is deprecated in newer pandas.
ride_count_by_month = cycle_data.groupby(pd.Grouper(key='started_at', freq='MS')).size()
plt.figure(figsize=(12, 6))
plt.plot(ride_count_by_month.index, ride_count_by_month.values, color='#4CAF50', linewidth=2, marker='o', markersize=6, markerfacecolor='#FF5722', markeredgecolor='none')
plt.xlabel('Month', fontsize=14, fontweight='bold')
plt.ylabel('Ride Count', fontsize=14, fontweight='bold')
plt.title('Monthly Ride Count', fontsize=16, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# Convert the start timestamps to datetime (harmless if already converted).
cycle_data['started_at'] = pd.to_datetime(cycle_data['started_at'])


def _plot_ride_counts(ax, counts, xlabel, title, minor_locator=None, show_legend=True):
    """Draw one ride-count time series on *ax* using the styling shared by all panels.

    Args:
        ax: Matplotlib axes to draw on.
        counts: Series of ride counts indexed by timestamp.
        xlabel: Label for the x axis.
        title: Panel title.
        minor_locator: Optional matplotlib date locator for the minor ticks.
        show_legend: Whether to add the 'Ride Count' legend entry.
    """
    ax.plot(counts.index, counts.values, color='#4CAF50', linewidth=2, marker='o',
            markersize=6, markerfacecolor='#FF5722', markeredgecolor='none')
    ax.set_xlabel(xlabel, fontsize=14, fontweight='bold')
    ax.set_ylabel('Ride Count', fontsize=14, fontweight='bold')
    ax.set_title(title, fontsize=16, fontweight='bold')
    # One major tick per month, formatted like "Jan 2022".
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
    if minor_locator is not None:
        ax.xaxis.set_minor_locator(minor_locator)
    # plt.xticks acts on the current axes, so make *ax* current first.
    plt.sca(ax)
    plt.xticks(rotation=45, ha='right')
    ax.grid(True, linestyle='--', alpha=0.7)
    if show_legend:
        ax.legend(['Ride Count'], loc='upper left')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)


ride_count_by_day = cycle_data.groupby(pd.Grouper(key='started_at', freq='D')).size()
ride_count_by_week = cycle_data.groupby(pd.Grouper(key='started_at', freq='W')).size()
# 'MS' labels each bin with the month's first day so points line up with the
# monthly ticks; 'M' labelled them with the month's last day.
ride_count_by_month = cycle_data.groupby(pd.Grouper(key='started_at', freq='MS')).size()

# A 2x2 grid: the daily series spans the top row, weekly and monthly share the
# bottom row. The previous 3x2 grid never used its third row and left it blank.
fig = plt.figure(figsize=(12, 12))
_plot_ride_counts(fig.add_subplot(2, 2, (1, 2)), ride_count_by_day, 'Date',
                  'Ride Count Over Time (Daily)', minor_locator=mdates.DayLocator(interval=5))
_plot_ride_counts(fig.add_subplot(2, 2, 3), ride_count_by_week, 'Week',
                  'Ride Count Over Time (Weekly)', minor_locator=mdates.WeekdayLocator())
_plot_ride_counts(fig.add_subplot(2, 2, 4), ride_count_by_month, 'Month',
                  'Monthly Ride Count', show_legend=False)
# Lay out once, after every panel exists.
fig.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Try the Fraunces font so the chart matches the blog page. Matplotlib falls
# back to its default font when Fraunces is not installed.
fraunces_font = fm.FontProperties(family='Fraunces')
cycle_data['hour'] = cycle_data['started_at'].dt.hour
# Rides per start hour, plus the mean and maximum used for the annotations.
ride_count_by_hour = cycle_data.groupby('hour').size()
mean_ride_count = ride_count_by_hour.mean()
max_ride_count = ride_count_by_hour.max()
plt.figure(figsize=(12, 8))
hour_ax = ride_count_by_hour.plot(kind='bar', color='#4F6272', edgecolor='#1C3144', linewidth=1.5, alpha=0.8)
plt.xlabel('Hour of the Day', fontsize=16, fontweight='bold', fontproperties=fraunces_font)
plt.ylabel('Number of Rides', fontsize=16, fontweight='bold', fontproperties=fraunces_font)
plt.title('Peak Ride Hours', fontsize=18, fontweight='bold', fontproperties=fraunces_font)
mean_line = plt.axhline(y=mean_ride_count, color='red', linestyle='--', linewidth=1.5)
# va='bottom' keeps each annotation above its reference height.
plt.text(0, mean_ride_count + 10, f'Mean: {mean_ride_count:.0f} rides', color='red', fontsize=12, fontweight='bold', fontproperties=fraunces_font, va='bottom')
plt.text(0, max_ride_count + 10, f'Max: {max_ride_count:.0f} rides', color='blue', fontsize=12, fontweight='bold', fontproperties=fraunces_font, va='bottom')
plt.xticks(rotation=0, fontsize=14, fontproperties=fraunces_font)
plt.yticks(fontsize=14, fontproperties=fraunces_font)
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Pass the handles explicitly. With labels only, matplotlib pairs them with the
# artists in its own order (lines before bar containers), which attached
# 'Ride Count' to the mean line and 'Mean Ride Count' to the bars.
plt.legend([hour_ax.containers[0], mean_line], ['Ride Count', 'Mean Ride Count'], loc='upper right', fontsize=12, prop=fraunces_font)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.tight_layout()
plt.show()
findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans. findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans. findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans. findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans. findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans.
pip install geopy
Requirement already satisfied: geopy in c:\users\jatin gagwani\anaconda3\lib\site-packages (2.4.1) Requirement already satisfied: geographiclib<3,>=1.52 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from geopy) (2.0) Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Hour of day and weekday name of each ride's start time.
cycle_data['start_hour'] = cycle_data['started_at'].dt.hour
cycle_data['day_of_week'] = cycle_data['started_at'].dt.day_name()
# Number of rides for every (start hour, weekday) combination.
pivot_table = cycle_data.pivot_table(index='start_hour', columns='day_of_week', aggfunc='size')
# pivot_table sorts the weekday columns alphabetically (Friday, Monday, ...);
# put them back into calendar order so the heat map reads Monday to Sunday.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
pivot_table = pivot_table.reindex(columns=weekday_order)
plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, cmap='coolwarm', linewidths=0.5)
plt.title('Ride Start Time Distribution by Hour and Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Start Hour')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
# Plot from a small two-column copy so the ordered categorical below does not
# change the dtype of cycle_data['day_of_week'] for other cells.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_data = cycle_data[['day_of_week', 'member_casual']].copy()
# Without an explicit order the weekdays are drawn in order of first appearance.
weekday_data['day_of_week'] = pd.Categorical(weekday_data['day_of_week'], categories=weekday_order, ordered=True)
plt.figure(figsize=(12, 8))  # Increase figure size for better visualization
weekday_ax = sns.histplot(data=weekday_data, x='day_of_week', hue='member_casual',
                          hue_order=['casual', 'member'], multiple='stack', palette='coolwarm')
plt.title('Ride Frequency by Day of Week', fontsize=20, fontweight='bold', color='navy')
plt.xlabel('Day of Week', fontsize=16, fontweight='bold', color='darkslategray')
plt.ylabel('Frequency', fontsize=16, fontweight='bold', color='darkslategray')
# Restyle the legend seaborn already built rather than replacing it with
# hand-written labels: the replacement labels and label colours were not tied
# to the plotted hue levels, so they could name and colour the wrong bars.
weekday_legend = weekday_ax.get_legend()
if weekday_legend is not None:
    weekday_legend.set_title('User Type')
    weekday_legend.get_title().set_fontsize(14)
    for legend_text in weekday_legend.get_texts():
        legend_text.set_text(legend_text.get_text().capitalize())
        legend_text.set_fontsize(14)
plt.xticks(rotation=45, fontsize=14, color='dimgray')
plt.yticks(fontsize=14, color='dimgray')
plt.tight_layout()
plt.show()
import pandas as pd
# Weekday name of each ride's start timestamp, stored as a new column.
cycle_data['day_of_week'] = cycle_data['started_at'].dt.day_name()
import pandas as pd
import matplotlib.pyplot as plt
# Month number of each ride's start timestamp, stored as a new column.
cycle_data['month'] = cycle_data['started_at'].dt.month
def get_season(month):
    """Return the season label for a calendar month number.

    Args:
        month: Month number, 1 (January) through 12 (December).

    Returns:
        'Spring' for 3-5, 'Summer' for 6-8, 'Autumn' for 9-11 and
        'Winter' for 12, 1 and 2.

    Raises:
        ValueError: If *month* is not a month number from 1 to 12. The previous
            catch-all ``else`` silently labelled any such value (for example
            NaN or 13) as 'Winter'.
    """
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Autumn'
    if month in (12, 1, 2):
        return 'Winter'
    raise ValueError(f"month must be between 1 and 12, got {month!r}")
# Label every ride with the season of its start month.
cycle_data['season'] = cycle_data['month'].apply(get_season)
# groupby() sorts the seasons alphabetically; reindex puts them in calendar
# order (a season with no rides is shown with a count of 0).
season_order = ['Winter', 'Spring', 'Summer', 'Autumn']
ride_count_by_season = (
    cycle_data.groupby('season').size()
    .reindex(season_order, fill_value=0)
    .rename_axis('season')
    .reset_index(name='ride_count')
)
colors = ['#FFB6C1', '#ADD8E6', '#90EE90', '#FFD700']
plt.figure(figsize=(10, 6))
bars = plt.bar(ride_count_by_season['season'], ride_count_by_season['ride_count'], color=colors)
# Write each bar's ride count just above it; counts are whole numbers, so they
# are formatted as integers rather than as floats such as "123.0".
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval + 10, f'{int(yval):,}', va='bottom', ha='center', fontsize=12)
plt.title('Ride Count by Season', fontsize=18, fontweight='bold', color='#333333')
plt.xlabel('Season', fontsize=14, fontweight='bold', color='#666666')
plt.ylabel('Ride Count', fontsize=14, fontweight='bold', color='#666666')
plt.xticks(fontsize=12, color='#666666')
plt.yticks(fontsize=12, color='#666666')
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Pass the individual bars as handles. With labels only, the legend is built
# from the single bar container, so it could show at most the first season.
plt.legend(list(bars), list(ride_count_by_season['season']), loc='upper right')
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
# Month number of each ride's start timestamp, stored as a new column.
cycle_data['month'] = cycle_data['started_at'].dt.month
def get_season(month):
    """Return the season label for a calendar month number.

    Args:
        month: Month number, 1 (January) through 12 (December).

    Returns:
        'Spring' for 3-5, 'Summer' for 6-8, 'Autumn' for 9-11 and
        'Winter' for 12, 1 and 2.

    Raises:
        ValueError: If *month* is not a month number from 1 to 12. The previous
            catch-all ``else`` silently labelled any such value (for example
            NaN or 13) as 'Winter'.
    """
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Autumn'
    if month in (12, 1, 2):
        return 'Winter'
    raise ValueError(f"month must be between 1 and 12, got {month!r}")
# Label every ride with the season of its start month.
cycle_data['season'] = cycle_data['month'].apply(get_season)
# Rides per season as a two-column frame: 'season' and 'ride_count'.
rides_per_season = cycle_data.groupby('season').size()
ride_count_by_season = rides_per_season.reset_index(name='ride_count')
colors = ['#FF6347', '#FFD700', '#00FF00', '#1E90FF']
# Build a colormap from the four colours and sample it once per colour.
cmap = mcolors.LinearSegmentedColormap.from_list("mycmap", colors)
wedge_colors = cmap(np.linspace(0, 1, len(colors)))
plt.figure(figsize=(10, 8))
# width=0.5 leaves the centre of the pie empty (ring-shaped wedges).
wedges, texts, autotexts = plt.pie(
    ride_count_by_season['ride_count'],
    labels=ride_count_by_season['season'],
    autopct='%1.1f%%',
    startangle=140,
    colors=wedge_colors,
    wedgeprops=dict(width=0.5, edgecolor='w'),
    shadow=True,
)
# Style the percentage labels drawn inside the wedges.
plt.setp(autotexts, size=12, weight="bold", color="white")
plt.title('Distribution of Rides by Season', fontsize=20, fontweight='bold', color='#333333', pad=20)
plt.axis('equal')
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
# Rides per (rideable type, user type): one row per rideable type, one column per user type.
ride_count_by_type = cycle_data.groupby(['rideable_type', 'member_casual']).size().unstack(fill_value=0)
num_rideable_types = len(ride_count_by_type)
num_rows = 1
num_cols = num_rideable_types
# Palette for the user-type wedges, defined here. The previous code borrowed the
# global `colors` list that an earlier cell created for its four-season chart.
user_type_colors = ['#1f77b4', '#ff7f0e']
# squeeze=False always returns a 2-D array of axes, so a single rideable type
# needs no special case.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 6), squeeze=False)
# One ring chart per rideable type, split by user type.
for ax, (rideable_type, counts) in zip(axes.flat, ride_count_by_type.iterrows()):
    ax.pie(counts, labels=ride_count_by_type.columns, autopct='%1.1f%%', startangle=90, colors=user_type_colors,
           wedgeprops=dict(width=0.3), textprops=dict(fontsize=12))
    ax.set_title(f'Distribution of Rides by User Type ({rideable_type})', fontsize=14, pad=20)
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
# Ride count (ordered by frequency) and mean ride duration (ordered by label) per user type.
ride_count_by_user_type = cycle_data['member_casual'].value_counts()
avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()
# Colours are keyed by user type. value_counts() orders by frequency while
# groupby() orders alphabetically, so a positional ['blue', 'orange'] list gave
# the same user type a different colour in each panel.
color_by_user_type = {'casual': 'blue', 'member': 'orange'}
count_colors = [color_by_user_type.get(user_type, 'gray') for user_type in ride_count_by_user_type.index]
duration_colors = [color_by_user_type.get(user_type, 'gray') for user_type in avg_duration_by_user_type.index]
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Left panel: ride counts, each bar annotated with its value.
axes[0].bar(ride_count_by_user_type.index, ride_count_by_user_type.values, color=count_colors)
axes[0].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Ride Count', fontsize=12, fontweight='bold')
axes[0].set_title('Ride Frequency by User Type', fontsize=14, fontweight='bold')
for i, count in enumerate(ride_count_by_user_type.values):
    axes[0].text(i, count + 50, str(count), ha='center', va='bottom', fontsize=10)
# Right panel: mean ride duration in minutes, annotated to two decimals.
axes[1].bar(avg_duration_by_user_type.index, avg_duration_by_user_type.values, color=duration_colors)
axes[1].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Average Ride Duration (minutes)', fontsize=12, fontweight='bold')
axes[1].set_title('Average Ride Duration by User Type', fontsize=14, fontweight='bold')
for i, duration in enumerate(avg_duration_by_user_type.values):
    axes[1].text(i, duration + 5, f'{duration:.2f}', ha='center', va='bottom', fontsize=10)
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Calendar date of every ride start; used as the grouping key for both series.
ride_dates = cycle_data['started_at'].dt.date
casual_rides = cycle_data[cycle_data['member_casual'] == 'casual']
member_rides = cycle_data[cycle_data['member_casual'] == 'member']
ride_count_by_day_casual = casual_rides.groupby(ride_dates).size()
ride_count_by_day_member = member_rides.groupby(ride_dates).size()
plt.figure(figsize=(10, 6))
# One line per user type: (daily counts, legend label, line colour).
for daily_counts, series_label, line_color in (
    (ride_count_by_day_casual, 'Casual', 'orange'),
    (ride_count_by_day_member, 'Member', 'royalblue'),
):
    plt.plot(daily_counts.index, daily_counts.values, label=series_label, color=line_color)
plt.title("Ride Frequency Over Time")
plt.xlabel("Month")
plt.ylabel("Ride Count")
plt.legend()
plt.grid(True)
plt.show()
# Ride count (ordered by frequency) and mean ride duration (ordered by label) per user type.
ride_count_by_user_type = cycle_data['member_casual'].value_counts()
avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()
# Colours are keyed by user type. value_counts() orders by frequency while
# groupby() orders alphabetically, so a positional ['blue', 'orange'] list gave
# the same user type a different colour in each panel.
color_by_user_type = {'casual': 'blue', 'member': 'orange'}
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Each panel: (axes, series, y label, title, annotation offset, annotation format).
bar_panels = (
    (axes[0], ride_count_by_user_type, 'Ride Count', 'Ride Frequency by User Type', 50, '{}'),
    (axes[1], avg_duration_by_user_type, 'Average Ride Duration (minutes)',
     'Average Ride Duration by User Type', 5, '{:.2f}'),
)
for panel_ax, series, ylabel, title, offset, value_format in bar_panels:
    bar_colors = [color_by_user_type.get(user_type, 'gray') for user_type in series.index]
    panel_ax.bar(series.index, series.values, color=bar_colors)
    panel_ax.set_xlabel('User Type', fontsize=12, fontweight='bold')
    panel_ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
    panel_ax.set_title(title, fontsize=14, fontweight='bold')
    # Write each bar's value just above it.
    for i, value in enumerate(series.values):
        panel_ax.text(i, value + offset, value_format.format(value), ha='center', va='bottom', fontsize=10)
plt.tight_layout()
plt.show()
# Rides per (rideable type, user type): one ring chart per rideable type.
ride_count_by_type = cycle_data.groupby(['rideable_type', 'member_casual']).size().unstack(fill_value=0)
num_rideable_types = len(ride_count_by_type)
num_rows = 1
num_cols = num_rideable_types
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 6))
for i, (rideable_type, counts) in enumerate(ride_count_by_type.iterrows()):
    # plt.subplots returns a single Axes (not an array) when there is one column.
    ax = axes[i] if num_rideable_types > 1 else axes
    ax.pie(counts, labels=ride_count_by_type.columns, autopct='%1.1f%%', startangle=90,
           wedgeprops=dict(width=0.3), textprops=dict(fontsize=12))
    ax.set_title(f'Distribution of Rides by User Type ({rideable_type})', fontsize=14, pad=20)
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
colors = ['#1f77b4', '#ff7f0e']
# One colour per user type, reused by every chart below. Applying `colors`
# positionally coloured 'member' with colors[0] in the ride-count bars
# (value_counts order) but with colors[1] in the line, duration and pie charts.
color_by_user_type = {'casual': colors[0], 'member': colors[1]}

# --- Daily ride counts per user type ---
ride_count_by_day_casual = cycle_data[cycle_data['member_casual'] == 'casual'].groupby(cycle_data['started_at'].dt.date).size()
ride_count_by_day_member = cycle_data[cycle_data['member_casual'] == 'member'].groupby(cycle_data['started_at'].dt.date).size()
plt.figure(figsize=(10, 6))
plt.plot(ride_count_by_day_casual.index, ride_count_by_day_casual.values, label='Casual', color=color_by_user_type['casual'])
plt.plot(ride_count_by_day_member.index, ride_count_by_day_member.values, label='Member', color=color_by_user_type['member'])
plt.title("Ride Frequency Over Time")
plt.xlabel("Month")
plt.ylabel("Ride Count")
plt.legend()
plt.grid(True)
plt.show()

# --- Ride count and mean duration per user type ---
ride_count_by_user_type = cycle_data['member_casual'].value_counts()
avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Each panel: (axes, series, y label, title, annotation offset, annotation format).
bar_panels = (
    (axes[0], ride_count_by_user_type, 'Ride Count', 'Ride Frequency by User Type', 50, '{}'),
    (axes[1], avg_duration_by_user_type, 'Average Ride Duration (minutes)',
     'Average Ride Duration by User Type', 5, '{:.2f}'),
)
for panel_ax, series, ylabel, title, offset, value_format in bar_panels:
    bar_colors = [color_by_user_type.get(user_type, 'gray') for user_type in series.index]
    panel_ax.bar(series.index, series.values, color=bar_colors)
    panel_ax.set_xlabel('User Type', fontsize=12, fontweight='bold')
    panel_ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
    panel_ax.set_title(title, fontsize=14, fontweight='bold')
    # Write each bar's value just above it.
    for i, value in enumerate(series.values):
        panel_ax.text(i, value + offset, value_format.format(value), ha='center', va='bottom', fontsize=10)
plt.tight_layout()
plt.show()

# --- User-type split for each rideable type ---
ride_count_by_type = cycle_data.groupby(['rideable_type', 'member_casual']).size().unstack(fill_value=0)
num_rideable_types = len(ride_count_by_type)
num_rows = 1
num_cols = num_rideable_types
pie_colors = [color_by_user_type.get(user_type, 'gray') for user_type in ride_count_by_type.columns]
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 6))
for i, (rideable_type, counts) in enumerate(ride_count_by_type.iterrows()):
    # plt.subplots returns a single Axes (not an array) when there is one column.
    ax = axes[i] if num_rideable_types > 1 else axes
    ax.pie(counts, labels=ride_count_by_type.columns, autopct='%1.1f%%', startangle=90,
           wedgeprops=dict(width=0.3), textprops=dict(fontsize=12), colors=pie_colors)
    ax.set_title(f'Distribution of Rides by User Type ({rideable_type})', fontsize=14, pad=20)
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
colors = ['#1f77b4', '#ff7f0e']
# One colour per user type, reused by the line, bar and pie charts below.
# Applying `colors` positionally coloured 'member' with colors[0] in the
# ride-count bars (value_counts order) but with colors[1] everywhere else.
color_by_user_type = {'casual': colors[0], 'member': colors[1]}

# --- Daily ride counts per user type ---
ride_count_by_day_casual = cycle_data[cycle_data['member_casual'] == 'casual'].groupby(cycle_data['started_at'].dt.date).size()
ride_count_by_day_member = cycle_data[cycle_data['member_casual'] == 'member'].groupby(cycle_data['started_at'].dt.date).size()
plt.figure(figsize=(10, 6))
plt.plot(ride_count_by_day_casual.index, ride_count_by_day_casual.values, label='Casual', color=color_by_user_type['casual'])
plt.plot(ride_count_by_day_member.index, ride_count_by_day_member.values, label='Member', color=color_by_user_type['member'])
plt.title("Ride Frequency Over Time")
plt.xlabel("Month")
plt.ylabel("Ride Count")
plt.legend()
plt.grid(True)
plt.show()

# --- Ride count and mean duration per user type ---
ride_count_by_user_type = cycle_data['member_casual'].value_counts()
avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Each panel: (axes, series, y label, title, annotation offset, annotation format).
bar_panels = (
    (axes[0], ride_count_by_user_type, 'Ride Count', 'Ride Frequency by User Type', 50, '{}'),
    (axes[1], avg_duration_by_user_type, 'Average Ride Duration (minutes)',
     'Average Ride Duration by User Type', 5, '{:.2f}'),
)
for panel_ax, series, ylabel, title, offset, value_format in bar_panels:
    bar_colors = [color_by_user_type.get(user_type, 'gray') for user_type in series.index]
    panel_ax.bar(series.index, series.values, color=bar_colors)
    panel_ax.set_xlabel('User Type', fontsize=12, fontweight='bold')
    panel_ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
    panel_ax.set_title(title, fontsize=14, fontweight='bold')
    # Write each bar's value just above it.
    for i, value in enumerate(series.values):
        panel_ax.text(i, value + offset, value_format.format(value), ha='center', va='bottom', fontsize=10)
plt.tight_layout()
plt.show()

# --- User-type split for each rideable type ---
ride_count_by_type = cycle_data.groupby(['rideable_type', 'member_casual']).size().unstack(fill_value=0)
num_rideable_types = len(ride_count_by_type)
num_rows = 1
num_cols = num_rideable_types
pie_colors = [color_by_user_type.get(user_type, 'gray') for user_type in ride_count_by_type.columns]
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 6))
for i, (rideable_type, counts) in enumerate(ride_count_by_type.iterrows()):
    # plt.subplots returns a single Axes (not an array) when there is one column.
    ax = axes[i] if num_rideable_types > 1 else axes
    ax.pie(counts, labels=ride_count_by_type.columns, autopct='%1.1f%%', startangle=90,
           wedgeprops=dict(width=0.3), textprops=dict(fontsize=12), colors=pie_colors)
    ax.set_title(f'Distribution of Rides by User Type ({rideable_type})', fontsize=14, pad=20)
plt.tight_layout()

# --- Rides per weekday, stacked by user type ---
# Plot from a small two-column copy with an ordered weekday categorical; without
# it the weekdays are drawn in order of first appearance in the data.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_data = cycle_data[['day_of_week', 'member_casual']].copy()
weekday_data['day_of_week'] = pd.Categorical(weekday_data['day_of_week'], categories=weekday_order, ordered=True)
plt.figure(figsize=(12, 8))
weekday_ax = sns.histplot(data=weekday_data, x='day_of_week', hue='member_casual',
                          hue_order=['casual', 'member'], multiple='stack', palette='coolwarm')
plt.title('Ride Frequency by Day of Week', fontsize=20, fontweight='bold', color='navy')
plt.xlabel('Day of Week', fontsize=16, fontweight='bold', color='darkslategray')
plt.ylabel('Frequency', fontsize=16, fontweight='bold', color='darkslategray')
# Restyle the legend seaborn already built rather than replacing it with
# hand-written labels: the replacement labels and label colours were not tied
# to the plotted hue levels, so they could name and colour the wrong bars.
weekday_legend = weekday_ax.get_legend()
if weekday_legend is not None:
    weekday_legend.set_title('User Type')
    weekday_legend.get_title().set_fontsize(14)
    for legend_text in weekday_legend.get_texts():
        legend_text.set_text(legend_text.get_text().capitalize())
        legend_text.set_fontsize(14)
plt.xticks(rotation=45, fontsize=14, color='dimgray')
plt.yticks(fontsize=14, color='dimgray')
plt.tight_layout()
plt.show()
pip install dash
Requirement already satisfied: dash in c:\users\jatin gagwani\anaconda3\lib\site-packages (2.16.1)Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: Flask<3.1,>=1.0.4 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (1.1.2) Requirement already satisfied: plotly>=5.0.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (5.9.0) Requirement already satisfied: retrying in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (1.3.4) Requirement already satisfied: typing-extensions>=4.1.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (4.3.0) Requirement already satisfied: setuptools in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (63.4.1) Requirement already satisfied: dash-table==5.0.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (5.0.0) Requirement already satisfied: Werkzeug<3.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (2.0.3) Requirement already satisfied: importlib-metadata in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (4.11.3) Requirement already satisfied: dash-core-components==2.0.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (2.0.0) Requirement already satisfied: nest-asyncio in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (1.5.5) Requirement already satisfied: requests in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (2.28.1) Requirement already satisfied: dash-html-components==2.0.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (2.0.0) Requirement already satisfied: itsdangerous>=0.24 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (2.0.1) Requirement already satisfied: click>=5.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (8.0.4) Requirement already satisfied: Jinja2>=2.10.1 in c:\users\jatin 
gagwani\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (3.1.3) Requirement already satisfied: tenacity>=6.2.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from plotly>=5.0.0->dash) (8.0.1) Requirement already satisfied: zipp>=0.5 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from importlib-metadata->dash) (3.8.0) Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from requests->dash) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from requests->dash) (3.3) Requirement already satisfied: certifi>=2017.4.17 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from requests->dash) (2024.2.2) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from requests->dash) (1.26.11) Requirement already satisfied: six>=1.7.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from retrying->dash) (1.16.0) Requirement already satisfied: colorama in c:\users\jatin gagwani\anaconda3\lib\site-packages (from click>=5.1->Flask<3.1,>=1.0.4->dash) (0.4.5) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from Jinja2>=2.10.1->Flask<3.1,>=1.0.4->dash) (2.0.1)
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Create the 3x2 subplot grid for the dashboard. Titles are given one per
# panel, in the order the panels are filled below (left to right, top to
# bottom).
panel_titles = (
    "Ride Frequency by User Type",
    "Average Ride Duration by User Type",
    "Popular Starting Locations",
    "Popular Ending Locations",
    "Ride Duration Distribution by User Type",
    "Ride Frequency Over Time",
)
fig = make_subplots(rows=3, cols=2, subplot_titles=panel_titles)
# Panel (row 1, col 1): number of rides recorded for each user type.
ride_count_by_user_type = cycle_data['member_casual'].value_counts()
# value_counts() orders the labels by frequency, so a positional colour list
# would colour whichever group is larger first. Key the colour on the label
# instead so each user type keeps the same colour as in the histogram panel
# (member = blue, casual = orange); unexpected labels fall back to gray.
user_type_colors = {'member': 'blue', 'casual': 'orange'}
fig.add_trace(
    go.Bar(
        x=ride_count_by_user_type.index,
        y=ride_count_by_user_type.values,
        marker_color=[user_type_colors.get(label, 'gray')
                      for label in ride_count_by_user_type.index],
    ),
    row=1, col=1,
)
fig.update_xaxes(title_text="User Type", row=1, col=1)
fig.update_yaxes(title_text="Ride Count", row=1, col=1)
# Panel (row 1, col 2): mean ride duration, in minutes, for each user type.
avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()
# groupby() returns the labels in sorted order ('casual' before 'member'), so
# a positional ['blue', 'orange'] list would paint casual blue, the opposite
# of the histogram panel. Key the colour on the label so member = blue and
# casual = orange everywhere; unexpected labels fall back to gray.
user_type_colors = {'member': 'blue', 'casual': 'orange'}
fig.add_trace(
    go.Bar(
        x=avg_duration_by_user_type.index,
        y=avg_duration_by_user_type.values,
        marker_color=[user_type_colors.get(label, 'gray')
                      for label in avg_duration_by_user_type.index],
    ),
    row=1, col=2,
)
fig.update_xaxes(title_text="User Type", row=1, col=2)
fig.update_yaxes(title_text="Average Ride Duration (minutes)", row=1, col=2)
# Panel (row 2, col 1): the five start stations with the most rides.
starting_locations = (
    cycle_data['start_station_name']
    .value_counts()
    .nlargest(5)
)
start_station_bars = go.Bar(
    x=starting_locations.index,
    y=starting_locations.values,
    marker_color='green',
)
start_panel = dict(row=2, col=1)
fig.add_trace(start_station_bars, **start_panel)
fig.update_xaxes(title_text="Start Station", **start_panel)
fig.update_yaxes(title_text="Frequency", **start_panel)
# Panel (row 2, col 2): the five end stations with the most rides.
ending_locations = (
    cycle_data['end_station_name']
    .value_counts()
    .nlargest(5)
)
end_station_bars = go.Bar(
    x=ending_locations.index,
    y=ending_locations.values,
    marker_color='green',
)
end_panel = dict(row=2, col=2)
fig.add_trace(end_station_bars, **end_panel)
fig.update_xaxes(title_text="End Station", **end_panel)
fig.update_yaxes(title_text="Frequency", **end_panel)
# Panel (row 3, col 1): ride-duration distribution, one histogram per user
# type drawn on the same axes.
# NOTE(review): the raw duration column is handed to go.Histogram unbinned;
# if the combined dataset is large this may make the exported HTML heavy --
# confirm whether pre-binning is needed.
for user_type, legend_name, bar_color in (('member', 'Member', 'blue'),
                                          ('casual', 'Casual', 'orange')):
    # Select rows and column in one .loc call rather than filtering the whole
    # frame first and indexing the column afterwards.
    durations = cycle_data.loc[cycle_data['member_casual'] == user_type,
                               'ride_duration_minutes']
    fig.add_trace(
        go.Histogram(x=durations, name=legend_name,
                     marker_color=bar_color, opacity=0.7),
        row=3, col=1,
    )
# The traces are semi-transparent so they can be read on top of each other,
# but plotly places same-axes histograms side by side unless barmode is
# 'overlay'. The other bar panels hold a single trace each, so this
# figure-wide setting does not change how they are drawn.
fig.update_layout(barmode='overlay')
fig.update_xaxes(title_text="Ride Duration (minutes)", row=3, col=1)
fig.update_yaxes(title_text="Frequency", row=3, col=1)
# Panel (row 3, col 2): number of rides started on each calendar date.
# NOTE(review): the .dt accessor needs 'started_at' to already be a datetime
# column; the conversion is not visible in this part of the file -- confirm.
start_dates = cycle_data['started_at'].dt.date
ride_count_by_day = cycle_data.groupby(start_dates).size()
daily_rides_line = go.Scatter(
    x=ride_count_by_day.index,
    y=ride_count_by_day.values,
    mode='lines',
    marker_color='purple',
)
time_panel = dict(row=3, col=2)
fig.add_trace(daily_rides_line, **time_panel)
fig.update_xaxes(title_text="Date", **time_panel)
fig.update_yaxes(title_text="Ride Count", **time_panel)
# Give the whole figure a title, then export it as a standalone HTML file
# (path is relative to the current working directory).
dashboard_title = "Combined Dashboard"
fig.update_layout(title_text=dashboard_title)
dashboard_path = "combined_dashboard.html"
fig.write_html(dashboard_path)