import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import glob
import pandas as pd

# Find every CSV file in the data directory. glob returns matches in
# arbitrary order, so sort them to make the load (and row) order deterministic.
csv_files = sorted(glob.glob(r'C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\*.csv'))

# Fail early with a clear message; otherwise pd.concat raises a generic
# "No objects to concatenate" error when the pattern matches nothing.
if not csv_files:
    raise FileNotFoundError('No CSV files found in the Cycle data directory')

# Print the list of CSV files found
print(*csv_files, sep="\n")

# Concatenate all CSV files into a single DataFrame with a fresh 0..n-1 index
cycle_data = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)

C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202201-divvy-tripdata.csv.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202202-divvy-tripdata.csv.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202203-divvy-tripdata.csv.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202204-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202205-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202206-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202207-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202208-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202209-divvy-publictripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202210-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202211-divvy-tripdata.csv
C:\Users\Jatin Gagwani\Documents\Coursera_DA\Cycle\202212-divvy-tripdata.csv


cycle_data.head()


# Use head() to display the first few rows of the DataFrame
print(cycle_data.head())

# info() writes its report directly to stdout and returns None, so wrapping it
# in print() only appends a stray "None" line after the report.
cycle_data.info()

# Summary statistics of the numeric columns
print(cycle_data.describe())

# Check for missing values in each column
print(cycle_data.isnull().sum())

# Check data types of columns
print(cycle_data.dtypes)

            ride_id  rideable_type           started_at             ended_at  \
0  C2F7DD78E82EC875  electric_bike  2022-01-13 11:59:47  2022-01-13 12:02:44   
1  A6CF8980A652D272  electric_bike  2022-01-10 08:41:56  2022-01-10 08:46:17   
2  BD0F91DFF741C66D   classic_bike  2022-01-25 04:53:40  2022-01-25 04:58:01   
3  CBB80ED419105406   classic_bike  2022-01-04 00:18:04  2022-01-04 00:33:00   
4  DDC963BFDDA51EEA   classic_bike  2022-01-20 01:31:10  2022-01-20 01:37:12   

              start_station_name start_station_id  \
0       Glenwood Ave & Touhy Ave              525   
1       Glenwood Ave & Touhy Ave              525   
2  Sheffield Ave & Fullerton Ave     TA1306000016   
3       Clark St & Bryn Mawr Ave     KA1504000151   
4    Michigan Ave & Jackson Blvd     TA1309000002   

                end_station_name end_station_id  start_lat  start_lng  \
0           Clark St & Touhy Ave         RP-007  42.012800 -87.665906   
1           Clark St & Touhy Ave         RP-007  42.012763 -87.665967   
2  Greenview Ave & Fullerton Ave   TA1307000001  41.925602 -87.653708   
3      Paulina St & Montrose Ave   TA1309000021  41.983593 -87.669154   
4         State St & Randolph St   TA1305000029  41.877850 -87.624080   

     end_lat    end_lng member_casual  
0  42.012560 -87.674367        casual  
1  42.012560 -87.674367        casual  
2  41.925330 -87.665800        member  
3  41.961507 -87.671387        casual  
4  41.884621 -87.627834        member  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5667717 entries, 0 to 5667716
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
dtypes: float64(4), object(9)
memory usage: 562.1+ MB
None
          start_lat     start_lng       end_lat       end_lng
count  5.667717e+06  5.667717e+06  5.661859e+06  5.661859e+06
mean   4.190222e+01 -8.764783e+01  4.190242e+01 -8.764790e+01
std    4.626109e-02  2.999925e-02  6.805821e-02  1.082985e-01
min    4.164000e+01 -8.784000e+01  0.000000e+00 -8.814000e+01
25%    4.188103e+01 -8.766154e+01  4.188103e+01 -8.766260e+01
50%    4.190000e+01 -8.764410e+01  4.190000e+01 -8.764414e+01
75%    4.193000e+01 -8.762957e+01  4.193000e+01 -8.762963e+01
max    4.563503e+01 -7.379648e+01  4.237000e+01  0.000000e+00
ride_id                    0
rideable_type              0
started_at                 0
ended_at                   0
start_station_name    833064
start_station_id      833064
end_station_name      892742
end_station_id        892742
start_lat                  0
start_lng                  0
end_lat                 5858
end_lng                 5858
member_casual              0
dtype: int64
ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object


# Discard, in place, every ride that has at least one missing field
cycle_data.dropna(inplace=True)

# Normalise the column labels: underscores instead of spaces, all lower-case
normalised_labels = cycle_data.columns.str.replace(' ', '_').str.lower()
cycle_data.columns = normalised_labels

print(cycle_data.columns)

# Convert the trip start/end timestamps from text to datetimes
for timestamp_column in ('started_at', 'ended_at'):
    cycle_data[timestamp_column] = pd.to_datetime(cycle_data[timestamp_column])

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='object')


# Ride length in minutes. total_seconds() is required here: `.dt.seconds` is
# only the seconds *component* of each timedelta (0-86399), so it silently
# drops whole days from rides longer than 24 hours and wraps negative durations
# into large positive values.
cycle_data['ride_duration_minutes'] = (
    (cycle_data['ended_at'] - cycle_data['started_at']).dt.total_seconds() / 60
)


# Exploratory data analysis (EDA): mean trip length, kept as a timedelta
trip_lengths = cycle_data['ended_at'] - cycle_data['started_at']
average_ride_duration = trip_lengths.mean()
print("Average ride duration:", average_ride_duration)

Average ride duration: 0 days 00:17:05.710490552


# Tally the rides made by each user type (values of the 'member_casual' column)
user_types = cycle_data['member_casual']
ride_count_by_user_type = user_types.value_counts()

# Heading and table are printed on separate lines
print("Ride count by user type:", ride_count_by_user_type, sep="\n")

Ride count by user type:
member    2611171
casual    1758189
Name: member_casual, dtype: int64


# Hour of day taken from each ride's 'started_at' timestamp
start_times = cycle_data['started_at']
cycle_data['start_hour'] = start_times.dt.hour

# Rides per starting hour; value_counts() lists the busiest hour first
ride_count_by_hour = cycle_data['start_hour'].value_counts()

print("Peak hours for rides:", ride_count_by_hour, sep="\n")

Peak hours for rides:
17    451135
16    384582
18    373986
15    307606
19    273653
14    263969
13    257552
12    255740
11    219678
8     219256
20    193556
7     181117
10    176532
9     167054
21    156609
22    127787
6      97554
23     87524
0      58535
1      36905
5      34501
2      21266
3      12443
4      10820
Name: start_hour, dtype: int64


import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of ride duration (in minutes) for each user type.
# The duration uses total_seconds(): `.dt.seconds` is only the seconds
# component of a timedelta, so it drops whole days from multi-day rides and
# wraps negative durations, which distorts the plotted distribution.
ride_minutes = (cycle_data['ended_at'] - cycle_data['started_at']).dt.total_seconds() / 60

plt.figure(figsize=(10, 6))
sns.boxplot(x='member_casual', y=ride_minutes, data=cycle_data, palette='pastel')
plt.xlabel('User Type')
plt.ylabel('Ride Duration (minutes)')
plt.title('Distribution of Ride Durations by User Type')
plt.ylim(0, 150)  # restrict the y-axis to 0-150 minutes
plt.show()


import folium
from folium.plugins import HeatMap
from IPython.display import display

# Latitude/longitude on which the map is centred
latitude_center = 41.8781
longitude_center = -87.6298

# Base map centred on the Chicago riding area
m = folium.Map(location=[latitude_center, longitude_center], zoom_start=12)

# [lat, lng] pairs for the start and the end point of every ride
start_locations = cycle_data[['start_lat', 'start_lng']].to_numpy().tolist()
end_locations = cycle_data[['end_lat', 'end_lng']].to_numpy().tolist()

# Add one heat-map layer for the start points and one for the end points
for ride_points in (start_locations, end_locations):
    HeatMap(ride_points).add_to(m)

# Save the map to an HTML file
m.save('start_end_heatmap.html')


import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Make sure the start timestamps are datetimes before grouping on them
cycle_data['started_at'] = pd.to_datetime(cycle_data['started_at'])

# Number of rides that started in each calendar day
daily_grouper = pd.Grouper(key='started_at', freq='D')
ride_count_by_day = cycle_data.groupby(daily_grouper).size()

# Line chart of the daily totals
plt.figure(figsize=(12, 6))
daily_axes = plt.gca()
daily_axes.plot(
    ride_count_by_day.index,
    ride_count_by_day.values,
    color='#4CAF50',
    linewidth=2,
    marker='o',
    markersize=6,
    markerfacecolor='#FF5722',
    markeredgecolor='none',
)

axis_label_font = dict(fontsize=14, fontweight='bold')
daily_axes.set_xlabel('Date', **axis_label_font)
daily_axes.set_ylabel('Ride Count', **axis_label_font)
daily_axes.set_title('Ride Count Over Time (Daily)', fontsize=16, fontweight='bold')

# Major ticks once per month (abbreviated month + year), minor ticks every 5 days
daily_x_axis = daily_axes.xaxis
daily_x_axis.set_major_locator(mdates.MonthLocator())
daily_x_axis.set_major_formatter(mdates.DateFormatter('%b %Y'))
daily_x_axis.set_minor_locator(mdates.DayLocator(interval=5))

plt.xticks(rotation=45, ha='right')

daily_axes.grid(True, linestyle='--', alpha=0.7)
daily_axes.legend(['Ride Count'], loc='upper left')

# Hide the top and right borders of the plot area
for side in ('top', 'right'):
    daily_axes.spines[side].set_visible(False)

plt.tight_layout()
plt.show()


import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Make sure the start timestamps are datetimes before grouping on them
cycle_data['started_at'] = pd.to_datetime(cycle_data['started_at'])

# Number of rides per weekly bin
ride_count_by_week = cycle_data.groupby(pd.Grouper(key='started_at', freq='W')).size()

plt.figure(figsize=(12, 6))

weekly_line_style = {
    'color': '#4CAF50',
    'linewidth': 2,
    'marker': 'o',
    'markersize': 6,
    'markerfacecolor': '#FF5722',
    'markeredgecolor': 'none',
}
plt.plot(ride_count_by_week.index, ride_count_by_week.values, **weekly_line_style)

plt.xlabel('Week', fontsize=14, fontweight='bold')
plt.ylabel('Ride Count', fontsize=14, fontweight='bold')
plt.title('Ride Count Over Time (Weekly)', fontsize=16, fontweight='bold')

# Month-based major ticks, weekday-based minor ticks
weekly_axes = plt.gca()
weekly_axes.xaxis.set_major_locator(mdates.MonthLocator())
weekly_axes.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
weekly_axes.xaxis.set_minor_locator(mdates.WeekdayLocator())

plt.xticks(rotation=45, ha='right')
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(['Ride Count'], loc='upper left')

# Hide the top and right borders of the plot area
weekly_axes.spines['top'].set_visible(False)
weekly_axes.spines['right'].set_visible(False)

plt.tight_layout()
plt.show()


import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Make sure the start timestamps are datetimes before grouping on them
cycle_data['started_at'] = pd.to_datetime(cycle_data['started_at'])

# Number of rides per monthly bin
monthly_grouper = pd.Grouper(key='started_at', freq='M')
ride_count_by_month = cycle_data.groupby(monthly_grouper).size()

# Line chart of the monthly totals
plt.figure(figsize=(12, 6))
month_stamps = ride_count_by_month.index
month_totals = ride_count_by_month.values
plt.plot(
    month_stamps,
    month_totals,
    color='#4CAF50',
    linewidth=2,
    marker='o',
    markersize=6,
    markerfacecolor='#FF5722',
    markeredgecolor='none',
)

bold_label = dict(fontsize=14, fontweight='bold')
plt.xlabel('Month', **bold_label)
plt.ylabel('Ride Count', **bold_label)
plt.title('Monthly Ride Count', fontsize=16, fontweight='bold')

plt.xticks(rotation=45, ha='right')
plt.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()


import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Make sure the start timestamps are datetimes before grouping on them
cycle_data['started_at'] = pd.to_datetime(cycle_data['started_at'])


def _draw_ride_count_panel(position, counts, xlabel, title, minor_locator=None, decorate=True):
    """Plot one ride-count time series into a cell of the 2x2 dashboard grid.

    position: subplot index, or a (first, last) tuple to span several cells.
    counts: Series of ride counts indexed by period timestamp.
    xlabel, title: x-axis label and panel title.
    minor_locator: optional matplotlib locator for the minor x ticks.
    decorate: when True, add the legend and hide the top/right spines.
    Returns the Axes that was drawn on.
    """
    panel = plt.subplot(2, 2, position)
    panel.plot(counts.index, counts.values, color='#4CAF50', linewidth=2, marker='o',
               markersize=6, markerfacecolor='#FF5722', markeredgecolor='none')
    panel.set_xlabel(xlabel, fontsize=14, fontweight='bold')
    panel.set_ylabel('Ride Count', fontsize=14, fontweight='bold')
    panel.set_title(title, fontsize=16, fontweight='bold')
    panel.xaxis.set_major_locator(mdates.MonthLocator())
    panel.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
    if minor_locator is not None:
        panel.xaxis.set_minor_locator(minor_locator)
    # plt.subplot() made `panel` the current axes, so this rotates its labels
    plt.xticks(rotation=45, ha='right')
    panel.grid(True, linestyle='--', alpha=0.7)
    if decorate:
        panel.legend(['Ride Count'], loc='upper left')
        panel.spines['top'].set_visible(False)
        panel.spines['right'].set_visible(False)
    return panel


# Ride counts per day, per week and per month
ride_count_by_day = cycle_data.groupby(pd.Grouper(key='started_at', freq='D')).size()
ride_count_by_week = cycle_data.groupby(pd.Grouper(key='started_at', freq='W')).size()
ride_count_by_month = cycle_data.groupby(pd.Grouper(key='started_at', freq='M')).size()

# The panels only fill two rows (daily across the top, weekly and monthly
# below), so use a 2x2 grid; a 3x2 grid leaves the bottom third of the figure
# empty.
plt.figure(figsize=(12, 12))
_draw_ride_count_panel((1, 2), ride_count_by_day, 'Date', 'Ride Count Over Time (Daily)',
                       minor_locator=mdates.DayLocator(interval=5))
_draw_ride_count_panel(3, ride_count_by_week, 'Week', 'Ride Count Over Time (Weekly)',
                       minor_locator=mdates.WeekdayLocator())
# The monthly panel has no legend and keeps its spines
_draw_ride_count_panel(4, ride_count_by_month, 'Month', 'Monthly Ride Count', decorate=False)

# Lay out once, after every panel exists
plt.tight_layout()

plt.show()


import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Use the Fraunces font to match the visualization to the blog page.
# Matplotlib falls back to its default font when Fraunces is not installed.
fraunces_font = fm.FontProperties(family='Fraunces')

# Hour of day at which each ride started
cycle_data['hour'] = cycle_data['started_at'].dt.hour

# Rides per starting hour, ordered by hour
ride_count_by_hour = cycle_data.groupby('hour').size()

mean_ride_count = ride_count_by_hour.mean()
max_ride_count = ride_count_by_hour.max()

plt.figure(figsize=(12, 8))
hour_axes = ride_count_by_hour.plot(kind='bar', color='#4F6272', edgecolor='#1C3144', linewidth=1.5, alpha=0.8)

plt.xlabel('Hour of the Day', fontsize=16, fontweight='bold', fontproperties=fraunces_font)
plt.ylabel('Number of Rides', fontsize=16, fontweight='bold', fontproperties=fraunces_font)
plt.title('Peak Ride Hours', fontsize=18, fontweight='bold', fontproperties=fraunces_font)

# Dashed reference line at the mean, plus text annotations for mean and max
mean_line = plt.axhline(y=mean_ride_count, color='red', linestyle='--', linewidth=1.5)
plt.text(0, mean_ride_count + 10, f'Mean: {mean_ride_count:.0f} rides', color='red', fontsize=12, fontweight='bold', fontproperties=fraunces_font)
plt.text(0, max_ride_count + 10, f'Max: {max_ride_count:.0f} rides', color='blue', fontsize=12, fontweight='bold', fontproperties=fraunces_font)

plt.xticks(rotation=0, fontsize=14, fontproperties=fraunces_font)
plt.yticks(fontsize=14, fontproperties=fraunces_font)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Pass the handles explicitly. With labels alone, matplotlib pairs them with
# the axes' artists purely by order, so 'Mean Ride Count' ended up attached to
# a bar instead of the dashed mean line.
# `fontsize` is ignored whenever `prop` is given, so the size goes into the
# FontProperties object itself.
legend_font = fm.FontProperties(family='Fraunces', size=12)
plt.legend([hour_axes.containers[0], mean_line], ['Ride Count', 'Mean Ride Count'],
           loc='upper right', prop=legend_font)

hour_axes.spines['top'].set_visible(False)
hour_axes.spines['right'].set_visible(False)

plt.tight_layout()

plt.show()

findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Fraunces'] not found. Falling back to DejaVu Sans.


pip install geopy

Requirement already satisfied: geopy in c:\users\jatin gagwani\anaconda3\lib\site-packages (2.4.1)
Requirement already satisfied: geographiclib<3,>=1.52 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from geopy) (2.0)
Note: you may need to restart the kernel to use updated packages.


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Hour of day and weekday name of each ride's start time
cycle_data['start_hour'] = cycle_data['started_at'].dt.hour
cycle_data['day_of_week'] = cycle_data['started_at'].dt.day_name()

# Ride counts: one row per start hour, one column per weekday
pivot_table = cycle_data.pivot_table(index='start_hour', columns='day_of_week', aggfunc='size')

# The pivot sorts its columns alphabetically (Friday, Monday, Saturday, ...);
# put them in calendar order so weekday/weekend patterns read left to right.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
pivot_table = pivot_table.reindex(columns=day_order)

plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, cmap='coolwarm', linewidths=0.5)
plt.title('Ride Start Time Distribution by Hour and Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Start Hour')
plt.show()


import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

# Plot from a two-column copy whose weekday column is an ordered categorical,
# so the bars appear Monday..Sunday rather than in order of first appearance.
# cycle_data['day_of_week'] itself keeps its string dtype.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_data = cycle_data[['day_of_week', 'member_casual']].assign(
    day_of_week=lambda frame: pd.Categorical(frame['day_of_week'], categories=day_order, ordered=True)
)

plt.figure(figsize=(12, 8))  # Increase figure size for better visualization
weekday_axes = sns.histplot(data=weekday_data, x='day_of_week', hue='member_casual',
                            hue_order=['casual', 'member'], multiple='stack', palette='coolwarm')
plt.title('Ride Frequency by Day of Week', fontsize=20, fontweight='bold', color='navy')
plt.xlabel('Day of Week', fontsize=16, fontweight='bold', color='darkslategray')
plt.ylabel('Frequency', fontsize=16, fontweight='bold', color='darkslategray')

# Restyle the legend seaborn built for the hue instead of replacing it.
# plt.legend(labels=[...]) pairs the labels with bar patches purely by drawing
# order, so the entries could show the wrong colour for each user type.
hue_legend = weekday_axes.get_legend()
hue_legend.set_title('User Type')
plt.setp(hue_legend.get_title(), fontsize=14)
for entry in hue_legend.get_texts():
    entry.set_text(entry.get_text().capitalize())
    entry.set_fontsize(14)

plt.xticks(rotation=45, fontsize=14, color='dimgray')
plt.yticks(fontsize=14, color='dimgray')
plt.tight_layout()
plt.show()


import pandas as pd
# Weekday name derived from each ride's start timestamp
ride_start_times = cycle_data['started_at']
cycle_data['day_of_week'] = ride_start_times.dt.day_name()


import pandas as pd
import matplotlib.pyplot as plt

# Month number taken from each ride's start timestamp
ride_start_times = cycle_data['started_at']
cycle_data['month'] = ride_start_times.dt.month

def get_season(month):
    """Return the season name for a month number.

    3-5 -> 'Spring', 6-8 -> 'Summer', 9-11 -> 'Autumn'; every other value
    (including 12, 1 and 2) -> 'Winter'.
    """
    season_months = (
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Autumn', (9, 10, 11)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Anything not listed above is treated as winter
    return 'Winter'

# Label every ride with the season of its start month
cycle_data['season'] = cycle_data['month'].map(get_season)

# Rides per season as a two-column frame: 'season', 'ride_count'
season_sizes = cycle_data.groupby('season').size()
ride_count_by_season = season_sizes.reset_index(name='ride_count')

# One bar colour per season row
colors = ['#FFB6C1', '#ADD8E6', '#90EE90', '#FFD700']

plt.figure(figsize=(10, 6))
bars = plt.bar(ride_count_by_season['season'], ride_count_by_season['ride_count'], color=colors)

# Write each bar's value just above its top edge, centred horizontally
for season_bar in bars:
    bar_height = season_bar.get_height()
    bar_centre = season_bar.get_x() + season_bar.get_width() / 2
    plt.text(bar_centre, bar_height + 10, round(bar_height, 2), va='bottom', ha='center', fontsize=12)

axis_title_style = dict(fontsize=14, fontweight='bold', color='#666666')
tick_style = dict(fontsize=12, color='#666666')

plt.title('Ride Count by Season', fontsize=18, fontweight='bold', color='#333333')
plt.xlabel('Season', **axis_title_style)
plt.ylabel('Ride Count', **axis_title_style)
plt.xticks(**tick_style)
plt.yticks(**tick_style)
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.legend(ride_count_by_season['season'], loc='upper right')

plt.tight_layout()
plt.show()


import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np

# Month number of each ride, read from its 'started_at' timestamp
started = cycle_data['started_at']
cycle_data['month'] = started.dt.month

def get_season(month):
    """Map a month number to 'Spring', 'Summer', 'Autumn' or 'Winter'.

    Months 3-5 are spring, 6-8 summer and 9-11 autumn; any other value falls
    through to winter.
    """
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Autumn'
    return 'Winter'

# Label every ride with the season of its start month
cycle_data['season'] = cycle_data['month'].map(get_season)

# Rides per season as a two-column frame: 'season', 'ride_count'
ride_count_by_season = (
    cycle_data
    .groupby('season')
    .size()
    .reset_index(name='ride_count')
)

# Build a colormap from the anchor colours, then sample it once per anchor
colors = ['#FF6347', '#FFD700', '#00FF00', '#1E90FF']
cmap = mcolors.LinearSegmentedColormap.from_list("mycmap", colors)
sample_points = np.linspace(0, 1, len(colors))
wedge_colors = cmap(sample_points)


plt.figure(figsize=(10, 8))
# A wedge width below the radius turns the pie into a ring
wedges, texts, autotexts = plt.pie(
    ride_count_by_season['ride_count'],
    labels=ride_count_by_season['season'],
    autopct='%1.1f%%',
    startangle=140,
    colors=wedge_colors,
    wedgeprops={'width': 0.5, 'edgecolor': 'w'},
    shadow=True,
)

# Style the percentage labels drawn inside the wedges
plt.setp(autotexts, size=12, weight="bold", color="white")

plt.title('Distribution of Rides by Season', fontsize=20, fontweight='bold', color='#333333', pad=20)

# Equal aspect ratio
plt.axis('equal')

plt.tight_layout()
plt.show()


import matplotlib.pyplot as plt

# Ride counts: one row per bike type, one column per user type
ride_count_by_type = (
    cycle_data
    .groupby(['rideable_type', 'member_casual'])
    .size()
    .unstack(fill_value=0)
)

# One ring chart per bike type, laid out in a single row
num_rideable_types = len(ride_count_by_type)
num_rows, num_cols = 1, num_rideable_types

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 6))

for i, (rideable_type, counts) in enumerate(ride_count_by_type.iterrows()):
    # With a single column plt.subplots returns one Axes instead of an array
    if num_rideable_types > 1:
        ax = axes[i]
    else:
        ax = axes
    # NOTE(review): `colors` is not defined in this cell; it is whatever list
    # an earlier cell left in scope.
    ax.pie(
        counts,
        labels=ride_count_by_type.columns,
        autopct='%1.1f%%',
        startangle=90,
        colors=colors,
        wedgeprops={'width': 0.3},
        textprops={'fontsize': 12},
    )
    ax.set_title(f'Distribution of Rides by User Type ({rideable_type})', fontsize=14, pad=20)

plt.tight_layout()
plt.show()


import matplotlib.pyplot as plt

# Rides per user type (ordered by frequency) and mean ride duration per user
# type (ordered alphabetically by the groupby)
ride_count_by_user_type = cycle_data['member_casual'].value_counts()

avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()

# Colour bars by user-type label, not by position: the two Series above are
# ordered differently, so a positional ['blue', 'orange'] list painted
# 'member' blue in one panel and orange in the other.
user_type_colors = {'member': 'blue', 'casual': 'orange'}
count_bar_colors = [user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_user_type.index]
duration_bar_colors = [user_type_colors.get(user_type, 'gray') for user_type in avg_duration_by_user_type.index]

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: number of rides per user type
axes[0].bar(ride_count_by_user_type.index, ride_count_by_user_type.values, color=count_bar_colors)
axes[0].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Ride Count', fontsize=12, fontweight='bold')
axes[0].set_title('Ride Frequency by User Type', fontsize=14, fontweight='bold')

# Value label above each bar
for i, count in enumerate(ride_count_by_user_type.values):
    axes[0].text(i, count + 50, str(count), ha='center', va='bottom', fontsize=10)

# Right panel: mean ride duration per user type
axes[1].bar(avg_duration_by_user_type.index, avg_duration_by_user_type.values, color=duration_bar_colors)
axes[1].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Average Ride Duration (minutes)', fontsize=12, fontweight='bold')
axes[1].set_title('Average Ride Duration by User Type', fontsize=14, fontweight='bold')

for i, duration in enumerate(avg_duration_by_user_type.values):
    axes[1].text(i, duration + 5, f'{duration:.2f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()


import pandas as pd
import matplotlib.pyplot as plt

# --- Daily ride counts, one line per user type -------------------------------
# The grouping key is the start date of every ride; pandas aligns it to the
# filtered rows by index.
ride_count_by_day_casual = cycle_data[cycle_data['member_casual'] == 'casual'].groupby(cycle_data['started_at'].dt.date).size()
ride_count_by_day_member = cycle_data[cycle_data['member_casual'] == 'member'].groupby(cycle_data['started_at'].dt.date).size()

plt.figure(figsize=(10, 6))
plt.plot(ride_count_by_day_casual.index, ride_count_by_day_casual.values, label='Casual', color='orange')
plt.plot(ride_count_by_day_member.index, ride_count_by_day_member.values, label='Member', color='royalblue')
plt.title("Ride Frequency Over Time")
plt.xlabel("Month")
plt.ylabel("Ride Count")
plt.legend()
plt.grid(True)
plt.show()

# --- Ride count and mean duration per user type ------------------------------
ride_count_by_user_type = cycle_data['member_casual'].value_counts()

avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()

# Colour by user-type label, not by position. value_counts() orders by
# frequency and groupby() alphabetically, so a positional ['blue', 'orange']
# list gave 'member' and 'casual' swapped colours between the two bar panels.
# The mapping follows the line chart above: casual is orange, member is blue.
user_type_colors = {'casual': 'orange', 'member': 'blue'}

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

axes[0].bar(ride_count_by_user_type.index, ride_count_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_user_type.index])
axes[0].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Ride Count', fontsize=12, fontweight='bold')
axes[0].set_title('Ride Frequency by User Type', fontsize=14, fontweight='bold')

# Value label above each bar
for i, count in enumerate(ride_count_by_user_type.values):
    axes[0].text(i, count + 50, str(count), ha='center', va='bottom', fontsize=10)

axes[1].bar(avg_duration_by_user_type.index, avg_duration_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in avg_duration_by_user_type.index])
axes[1].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Average Ride Duration (minutes)', fontsize=12, fontweight='bold')
axes[1].set_title('Average Ride Duration by User Type', fontsize=14, fontweight='bold')

for i, duration in enumerate(avg_duration_by_user_type.values):
    axes[1].text(i, duration + 5, f'{duration:.2f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

# --- User-type split for each bike type --------------------------------------
ride_count_by_type = cycle_data.groupby(['rideable_type', 'member_casual']).size().unstack(fill_value=0)

num_rideable_types = len(ride_count_by_type)

num_rows = 1
num_cols = num_rideable_types

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 6))

# Same label-to-colour mapping as the bar charts, in column order
pie_colors = [user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_type.columns]

for i, (rideable_type, counts) in enumerate(ride_count_by_type.iterrows()):
    # With a single column plt.subplots returns one Axes instead of an array
    ax = axes[i] if num_rideable_types > 1 else axes
    ax.pie(counts, labels=ride_count_by_type.columns, autopct='%1.1f%%', startangle=90,
           colors=pie_colors, wedgeprops=dict(width=0.3), textprops=dict(fontsize=12))
    ax.set_title(f'Distribution of Rides by User Type ({rideable_type})', fontsize=14, pad=20)

plt.tight_layout()

plt.show()


import pandas as pd
import matplotlib.pyplot as plt

colors = ['#1f77b4', '#ff7f0e']

# Bind each palette entry to a user type once, so every chart below colours
# 'casual' and 'member' the same way. Passing `colors` positionally gave the
# count bar chart swapped colours, because value_counts() orders its index by
# frequency rather than alphabetically.
user_type_colors = {'casual': colors[0], 'member': colors[1]}

# --- Daily ride counts, one line per user type -------------------------------
ride_count_by_day_casual = cycle_data[cycle_data['member_casual'] == 'casual'].groupby(cycle_data['started_at'].dt.date).size()
ride_count_by_day_member = cycle_data[cycle_data['member_casual'] == 'member'].groupby(cycle_data['started_at'].dt.date).size()

plt.figure(figsize=(10, 6))
plt.plot(ride_count_by_day_casual.index, ride_count_by_day_casual.values, label='Casual', color=user_type_colors['casual'])
plt.plot(ride_count_by_day_member.index, ride_count_by_day_member.values, label='Member', color=user_type_colors['member'])
plt.title("Ride Frequency Over Time")
plt.xlabel("Month")
plt.ylabel("Ride Count")
plt.legend()
plt.grid(True)
plt.show()

# --- Ride count and mean duration per user type ------------------------------
ride_count_by_user_type = cycle_data['member_casual'].value_counts()

avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

axes[0].bar(ride_count_by_user_type.index, ride_count_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_user_type.index])
axes[0].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Ride Count', fontsize=12, fontweight='bold')
axes[0].set_title('Ride Frequency by User Type', fontsize=14, fontweight='bold')

# Value label above each bar
for i, count in enumerate(ride_count_by_user_type.values):
    axes[0].text(i, count + 50, str(count), ha='center', va='bottom', fontsize=10)

axes[1].bar(avg_duration_by_user_type.index, avg_duration_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in avg_duration_by_user_type.index])
axes[1].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Average Ride Duration (minutes)', fontsize=12, fontweight='bold')
axes[1].set_title('Average Ride Duration by User Type', fontsize=14, fontweight='bold')

for i, duration in enumerate(avg_duration_by_user_type.values):
    axes[1].text(i, duration + 5, f'{duration:.2f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

# --- User-type split for each bike type --------------------------------------
ride_count_by_type = cycle_data.groupby(['rideable_type', 'member_casual']).size().unstack(fill_value=0)

num_rideable_types = len(ride_count_by_type)

num_rows = 1
num_cols = num_rideable_types

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 6))

pie_colors = [user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_type.columns]

for i, (rideable_type, counts) in enumerate(ride_count_by_type.iterrows()):
    # With a single column plt.subplots returns one Axes instead of an array
    ax = axes[i] if num_rideable_types > 1 else axes
    ax.pie(counts, labels=ride_count_by_type.columns, autopct='%1.1f%%', startangle=90,
           wedgeprops=dict(width=0.3), textprops=dict(fontsize=12), colors=pie_colors)
    ax.set_title(f'Distribution of Rides by User Type ({rideable_type})', fontsize=14, pad=20)

plt.tight_layout()

plt.show()


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

colors = ['#1f77b4', '#ff7f0e']

# Bind each palette entry to a user type once, so every matplotlib chart below
# colours 'casual' and 'member' the same way. Passing `colors` positionally
# gave the count bar chart swapped colours, because value_counts() orders its
# index by frequency rather than alphabetically.
user_type_colors = {'casual': colors[0], 'member': colors[1]}

# --- Daily ride counts, one line per user type -------------------------------
ride_count_by_day_casual = cycle_data[cycle_data['member_casual'] == 'casual'].groupby(cycle_data['started_at'].dt.date).size()
ride_count_by_day_member = cycle_data[cycle_data['member_casual'] == 'member'].groupby(cycle_data['started_at'].dt.date).size()

plt.figure(figsize=(10, 6))
plt.plot(ride_count_by_day_casual.index, ride_count_by_day_casual.values, label='Casual', color=user_type_colors['casual'])
plt.plot(ride_count_by_day_member.index, ride_count_by_day_member.values, label='Member', color=user_type_colors['member'])
plt.title("Ride Frequency Over Time")
plt.xlabel("Month")
plt.ylabel("Ride Count")
plt.legend()
plt.grid(True)
plt.show()

# --- Ride count and mean duration per user type ------------------------------
ride_count_by_user_type = cycle_data['member_casual'].value_counts()

avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

axes[0].bar(ride_count_by_user_type.index, ride_count_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_user_type.index])
axes[0].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Ride Count', fontsize=12, fontweight='bold')
axes[0].set_title('Ride Frequency by User Type', fontsize=14, fontweight='bold')

# Value label above each bar
for i, count in enumerate(ride_count_by_user_type.values):
    axes[0].text(i, count + 50, str(count), ha='center', va='bottom', fontsize=10)

axes[1].bar(avg_duration_by_user_type.index, avg_duration_by_user_type.values,
            color=[user_type_colors.get(user_type, 'gray') for user_type in avg_duration_by_user_type.index])
axes[1].set_xlabel('User Type', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Average Ride Duration (minutes)', fontsize=12, fontweight='bold')
axes[1].set_title('Average Ride Duration by User Type', fontsize=14, fontweight='bold')

for i, duration in enumerate(avg_duration_by_user_type.values):
    axes[1].text(i, duration + 5, f'{duration:.2f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

# --- User-type split for each bike type --------------------------------------
ride_count_by_type = cycle_data.groupby(['rideable_type', 'member_casual']).size().unstack(fill_value=0)

num_rideable_types = len(ride_count_by_type)

num_rows = 1
num_cols = num_rideable_types

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 6))

pie_colors = [user_type_colors.get(user_type, 'gray') for user_type in ride_count_by_type.columns]

for i, (rideable_type, counts) in enumerate(ride_count_by_type.iterrows()):
    # With a single column plt.subplots returns one Axes instead of an array
    ax = axes[i] if num_rideable_types > 1 else axes
    ax.pie(counts, labels=ride_count_by_type.columns, autopct='%1.1f%%', startangle=90,
           wedgeprops=dict(width=0.3), textprops=dict(fontsize=12), colors=pie_colors)
    ax.set_title(f'Distribution of Rides by User Type ({rideable_type})', fontsize=14, pad=20)

plt.tight_layout()

# --- Stacked ride counts per weekday -----------------------------------------
# Plot from a two-column copy whose weekday column is an ordered categorical,
# so the bars appear Monday..Sunday rather than in order of first appearance.
# cycle_data['day_of_week'] itself keeps its string dtype.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_data = cycle_data[['day_of_week', 'member_casual']].assign(
    day_of_week=lambda frame: pd.Categorical(frame['day_of_week'], categories=day_order, ordered=True)
)

plt.figure(figsize=(12, 8))
weekday_axes = sns.histplot(data=weekday_data, x='day_of_week', hue='member_casual',
                            hue_order=['casual', 'member'], multiple='stack', palette='coolwarm')
plt.title('Ride Frequency by Day of Week', fontsize=20, fontweight='bold', color='navy')
plt.xlabel('Day of Week', fontsize=16, fontweight='bold', color='darkslategray')
plt.ylabel('Frequency', fontsize=16, fontweight='bold', color='darkslategray')

# Restyle the legend seaborn built for the hue instead of replacing it.
# plt.legend(labels=[...]) pairs the labels with bar patches purely by drawing
# order, so the entries could show the wrong colour for each user type.
hue_legend = weekday_axes.get_legend()
hue_legend.set_title('User Type')
plt.setp(hue_legend.get_title(), fontsize=14)
for entry in hue_legend.get_texts():
    entry.set_text(entry.get_text().capitalize())
    entry.set_fontsize(14)

plt.xticks(rotation=45, fontsize=14, color='dimgray')
plt.yticks(fontsize=14, color='dimgray')
plt.tight_layout()
plt.show()


pip install dash

Requirement already satisfied: dash in c:\users\jatin gagwani\anaconda3\lib\site-packages (2.16.1)Note: you may need to restart the kernel to use updated packages.

Requirement already satisfied: Flask<3.1,>=1.0.4 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (1.1.2)
Requirement already satisfied: plotly>=5.0.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (5.9.0)
Requirement already satisfied: retrying in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (1.3.4)
Requirement already satisfied: typing-extensions>=4.1.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (4.3.0)
Requirement already satisfied: setuptools in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (63.4.1)
Requirement already satisfied: dash-table==5.0.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (5.0.0)
Requirement already satisfied: Werkzeug<3.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (2.0.3)
Requirement already satisfied: importlib-metadata in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (4.11.3)
Requirement already satisfied: dash-core-components==2.0.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (2.0.0)
Requirement already satisfied: nest-asyncio in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (1.5.5)
Requirement already satisfied: requests in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (2.28.1)
Requirement already satisfied: dash-html-components==2.0.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from dash) (2.0.0)
Requirement already satisfied: itsdangerous>=0.24 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (2.0.1)
Requirement already satisfied: click>=5.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (8.0.4)
Requirement already satisfied: Jinja2>=2.10.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (3.1.3)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from plotly>=5.0.0->dash) (8.0.1)
Requirement already satisfied: zipp>=0.5 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from importlib-metadata->dash) (3.8.0)
Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from requests->dash) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from requests->dash) (3.3)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from requests->dash) (2024.2.2)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from requests->dash) (1.26.11)
Requirement already satisfied: six>=1.7.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from retrying->dash) (1.16.0)
Requirement already satisfied: colorama in c:\users\jatin gagwani\anaconda3\lib\site-packages (from click>=5.1->Flask<3.1,>=1.0.4->dash) (0.4.5)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\jatin gagwani\anaconda3\lib\site-packages (from Jinja2>=2.10.1->Flask<3.1,>=1.0.4->dash) (2.0.1)


import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Build a 3x2 dashboard of ride statistics from `cycle_data` and save it as a
# standalone HTML file.
#
# Columns read from `cycle_data`: 'member_casual', 'ride_duration_minutes',
# 'start_station_name', 'end_station_name', 'started_at'.

# One colour per user type, looked up by label so that every panel agrees.
# Positional colour lists are fragile here: value_counts() orders by count
# while groupby() orders keys alphabetically, so the same position can refer
# to different user types in different panels.
USER_TYPE_COLORS = {'member': 'blue', 'casual': 'orange'}
FALLBACK_COLOR = 'gray'  # used for any user type not listed above


def _user_type_colors(labels):
    """Return the bar colour for each user-type label, in the given order."""
    return [USER_TYPE_COLORS.get(label, FALLBACK_COLOR) for label in labels]


fig = make_subplots(rows=3, cols=2, subplot_titles=("Ride Frequency by User Type",
                                                     "Average Ride Duration by User Type",
                                                     "Popular Starting Locations",
                                                     "Popular Ending Locations",
                                                     "Ride Duration Distribution by User Type",
                                                     "Ride Frequency Over Time"))

# --- Row 1, col 1: number of rides per user type -------------------------
ride_count_by_user_type = cycle_data['member_casual'].value_counts()

fig.add_trace(go.Bar(x=ride_count_by_user_type.index, y=ride_count_by_user_type.values,
                     marker_color=_user_type_colors(ride_count_by_user_type.index),
                     showlegend=False), row=1, col=1)
fig.update_xaxes(title_text="User Type", row=1, col=1)
fig.update_yaxes(title_text="Ride Count", row=1, col=1)

# --- Row 1, col 2: mean ride duration per user type ----------------------
avg_duration_by_user_type = cycle_data.groupby('member_casual')['ride_duration_minutes'].mean()

fig.add_trace(go.Bar(x=avg_duration_by_user_type.index, y=avg_duration_by_user_type.values,
                     marker_color=_user_type_colors(avg_duration_by_user_type.index),
                     showlegend=False), row=1, col=2)
fig.update_xaxes(title_text="User Type", row=1, col=2)
fig.update_yaxes(title_text="Average Ride Duration (minutes)", row=1, col=2)

# --- Row 2, col 1: five most frequent start stations ---------------------
starting_locations = cycle_data['start_station_name'].value_counts().nlargest(5)
fig.add_trace(go.Bar(x=starting_locations.index, y=starting_locations.values,
                     marker_color='green', showlegend=False), row=2, col=1)
fig.update_xaxes(title_text="Start Station", row=2, col=1)
fig.update_yaxes(title_text="Frequency", row=2, col=1)

# --- Row 2, col 2: five most frequent end stations -----------------------
ending_locations = cycle_data['end_station_name'].value_counts().nlargest(5)
fig.add_trace(go.Bar(x=ending_locations.index, y=ending_locations.values,
                     marker_color='green', showlegend=False), row=2, col=2)
fig.update_xaxes(title_text="End Station", row=2, col=2)
fig.update_yaxes(title_text="Frequency", row=2, col=2)

# --- Row 3, col 1: ride duration distribution, one histogram per type ----
# These are the only traces kept in the legend; they are drawn on top of each
# other (see barmode='overlay' below), which is why they are semi-transparent.
# NOTE(review): go.Histogram receives the raw duration values, so every value
# is written into the HTML file — confirm the output size is acceptable.
for user_type, legend_label in (('member', 'Member'), ('casual', 'Casual')):
    durations = cycle_data.loc[cycle_data['member_casual'] == user_type,
                               'ride_duration_minutes']
    fig.add_trace(go.Histogram(x=durations, name=legend_label,
                               marker_color=USER_TYPE_COLORS[user_type],
                               opacity=0.7), row=3, col=1)
fig.update_xaxes(title_text="Ride Duration (minutes)", row=3, col=1)
fig.update_yaxes(title_text="Frequency", row=3, col=1)

# --- Row 3, col 2: number of rides per calendar day ----------------------
# pd.to_datetime leaves an already-datetime column as it is and parses it if
# it still holds strings; `cycle_data` itself is not modified.
ride_start_times = pd.to_datetime(cycle_data['started_at'])
ride_count_by_day = cycle_data.groupby(ride_start_times.dt.date).size()

fig.add_trace(go.Scatter(x=ride_count_by_day.index, y=ride_count_by_day.values,
                         mode='lines', marker_color='purple',
                         showlegend=False), row=3, col=2)
fig.update_xaxes(title_text="Date", row=3, col=2)
fig.update_yaxes(title_text="Ride Count", row=3, col=2)

# barmode='overlay' makes the two histograms share the same bins on screen.
# Every other panel holds a single bar trace, so it is unaffected.
fig.update_layout(title_text="Combined Dashboard", barmode='overlay')

fig.write_html("combined_dashboard.html")

	ride_id	rideable_type	started_at	ended_at	start_station_name	start_station_id	end_station_name	end_station_id	start_lat	start_lng	end_lat	end_lng	member_casual
0	C2F7DD78E82EC875	electric_bike	2022-01-13 11:59:47	2022-01-13 12:02:44	Glenwood Ave & Touhy Ave	525	Clark St & Touhy Ave	RP-007	42.012800	-87.665906	42.012560	-87.674367	casual
1	A6CF8980A652D272	electric_bike	2022-01-10 08:41:56	2022-01-10 08:46:17	Glenwood Ave & Touhy Ave	525	Clark St & Touhy Ave	RP-007	42.012763	-87.665967	42.012560	-87.674367	casual
2	BD0F91DFF741C66D	classic_bike	2022-01-25 04:53:40	2022-01-25 04:58:01	Sheffield Ave & Fullerton Ave	TA1306000016	Greenview Ave & Fullerton Ave	TA1307000001	41.925602	-87.653708	41.925330	-87.665800	member
3	CBB80ED419105406	classic_bike	2022-01-04 00:18:04	2022-01-04 00:33:00	Clark St & Bryn Mawr Ave	KA1504000151	Paulina St & Montrose Ave	TA1309000021	41.983593	-87.669154	41.961507	-87.671387	casual
4	DDC963BFDDA51EEA	classic_bike	2022-01-20 01:31:10	2022-01-20 01:37:12	Michigan Ave & Jackson Blvd	TA1309000002	State St & Randolph St	TA1305000029	41.877850	-87.624080	41.884621	-87.627834	member

DA | Cyclistic Bike Sharing | Google Capstone