import pandas as pd
import matplotlib.pyplot as plt


# Path to the games-sales CSV that this analysis reads.
files = "/content/drive/MyDrive/Data Analyst Project/revou-cs3/games_sales.csv"

# Load the dataset, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=files)
df.head(n=5)


# Report every column's non-null count and dtype to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Name       177 non-null    object 
 1   Sales      177 non-null    float64
 2   Series     141 non-null    object 
 3   Release    177 non-null    object 
 4   Genre      177 non-null    object 
 5   Developer  177 non-null    object 
 6   Publisher  177 non-null    object 
dtypes: float64(1), object(6)
memory usage: 9.8+ KB


# Fallback value for games without a Series: the first whitespace-separated
# word of the game's name (e.g. "Terraria" -> "Terraria").
# NOTE: this is only a rough placeholder -- multi-word titles keep just their
# first word ("The Forest" -> "The", "Fall Guys" -> "Fall").
splits = df['Name'].str.split()
s_cl = splits.str[0]

# Assign the filled column back to the frame rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form
# triggers a warning on pandas 2.x and, with Copy-on-Write enabled, modifies
# only a temporary object and leaves df unchanged.
df['Series'] = df['Series'].fillna(s_cl)


# Preview the first five rows again to check the Series column.
df.head(n=5)


# Parse the Release strings into datetime64 values so the column sorts
# chronologically instead of alphabetically.
release_dates = pd.to_datetime(df['Release'])
df['Release'] = release_dates


# Re-check dtypes and non-null counts after the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Name       177 non-null    object        
 1   Sales      177 non-null    float64       
 2   Series     177 non-null    object        
 3   Release    177 non-null    datetime64[ns]
 4   Genre      177 non-null    object        
 5   Developer  177 non-null    object        
 6   Publisher  177 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(5)
memory usage: 9.8+ KB


# Order the games chronologically by release date, earliest first.
gr_old = df.sort_values('Release')

# Show the ten earliest releases with their series and publisher.
print(gr_old.loc[:, ['Name', 'Release', 'Series', 'Publisher']].head(10))

                                       Name    Release                Series  \
133                                 Hydlide 1984-12-01               Hydlide   
34   Where in the World Is Carmen Sandiego? 1985-06-01       Carmen Sandiego   
88                     International Karate 1985-11-01  International Karate   
162                                  Tetris 1988-01-01                Tetris   
22                             Last Ninja 2 1988-08-01        The Last Ninja   
151                                 RoboCop 1988-12-01               RoboCop   
30                                 Populous 1989-06-01              Populous   
150                          Return to Zork 1993-08-01                  Zork   
51                                     Myst 1993-09-01                  Myst   
156                Star Wars: Rebel Assault 1993-11-01             Star Wars   

                                 Publisher  
133  Technology and Entertainment Software  
34                              Broderbund  
88                                    Epyx  
162                      Spectrum HoloByte  
22                              Activision  
151              Data East, Ocean Software  
30                         Electronic Arts  
150                             Activision  
51                              Brøderbund  
156                              LucasArts


# Print 10 newest games.
# gr_old is sorted oldest-first, so the newest rows sit at the end of the
# frame.  A reversed slice written as [:-10:-1] stops one row short and
# yields only 9 games; instead take the last 10 rows and reverse them so the
# most recent release is listed first.
print(gr_old[['Name', 'Release', 'Series', 'Publisher']].tail(10).iloc[::-1])

                          Name    Release          Series  \
20                     Valheim 2021-02-01         Valheim   
27              Cyberpunk 2077 2020-12-01       Cyberpunk   
110         Crusader Kings III 2020-09-01  Crusader Kings   
10                   Fall Guys 2020-08-01            Fall   
139                    Mordhau 2019-04-01         Mordhau   
92                Satisfactory 2019-03-01    Satisfactory   
26                  The Forest 2018-04-01             The   
13                        Rust 2018-02-01            Rust   
96   Kingdom Come: Deliverance 2018-02-01         Kingdom   

                   Publisher  
20   Coffee Stain Publishing  
27                CD Projekt  
110      Paradox Interactive  
10          Devolver Digital  
139               Triternion  
92   Coffee Stain Publishing  
26            Endnight Games  
13         Facepunch Studios  
96          Warhorse Studios


# Count how many games each developer has in the dataset.
# Each distinct Developer string is counted as one entity.
dev = df.groupby(['Developer'])['Developer'].count()

# Keep the five developers with the most games.
top5_dev = dev.nlargest(5)

# Plot the top five as a horizontal bar chart.
dev_g = top5_dev.plot(kind='barh', figsize=(10, 6))
plt.title("Top 5 Productive Developers")
plt.xlabel("Total Games")
# In a horizontal bar chart the grouped category (here: developers) is on
# the y-axis, so the label must name developers, not publishers.
plt.ylabel("Developer")
plt.show()


# Count how many games each publisher has in the dataset.
# Each distinct Publisher string is counted as one entity, so a combined
# entry such as "Data East, Ocean Software" is its own group.
pub = df.groupby(['Publisher'])['Publisher'].count()

# Keep the five publishers with the most games.
top5_pub = pub.nlargest(5)

# Plot the top five as a horizontal bar chart.
pub_g = top5_pub.plot(kind='barh', figsize=(10, 6))

plt.title("Top 5 Productive Publishers")
plt.xlabel("Total Games")
# In a horizontal bar chart the grouped category (here: publishers) is on
# the y-axis, so the label must name publishers, not developers.
plt.ylabel("Publisher")

# Show the value label at the end of each bar using Matplotlib's .bar_label().
pub_g.bar_label(pub_g.containers[0])
plt.show()


# Re-read the untouched CSV so that rows whose Series is missing (NaN) stay
# missing; groupby then leaves them out of the aggregation instead of
# counting the first-word placeholders filled into df.
raw = pd.read_csv(filepath_or_buffer=files)

# Number of games recorded for each series.
sg = raw.groupby('Series')['Series'].count()

# The five series with the most games.
top5_sg = sg.nlargest(n=5)

# Horizontal bar chart of those five series, labelled through the Axes object.
sg_bar = top5_sg.plot.barh(figsize=(10, 6))
sg_bar.set_title("Top 5 Series with The Most Games")
sg_bar.set_xlabel("Total Games")
sg_bar.set_ylabel("Series")
plt.show()


# Sum the Sales column per series, using the unfilled `raw` frame so games
# without a series are left out.
s_sales = raw.groupby('Series')['Sales'].sum()

# The five series with the highest combined sales.
top5_sales = s_sales.nlargest(n=5)

# Horizontal bar chart of those five series, with the total printed on each bar.
sales_g = top5_sales.plot.barh(figsize=(10, 6))
sales_g.set_title("Top 5 Series with Highest Sales")
# NOTE(review): the unit of Sales is not visible in the data shown here --
# confirm that it really is millions of USD and not millions of copies.
sales_g.set_xlabel("Total Sales (in million USD)")
sales_g.set_ylabel("Series")
sales_g.bar_label(sales_g.containers[0])
plt.show()

	Name	Sales	Series	Release	Genre	Developer	Publisher
0	PlayerUnknown's Battlegrounds	42.0	NaN	12/1/2017	Battle royale	PUBG Studios	Krafton
1	Minecraft	33.0	Minecraft	11/1/2011	Sandbox, survival	Mojang Studios	Mojang Studios
2	Diablo III	20.0	Diablo	5/1/2012	Action role-playing	Blizzard Entertainment	Blizzard Entertainment
3	Garry's Mod	20.0	NaN	11/1/2006	Sandbox	Facepunch Studios	Valve
4	Terraria	17.2	NaN	5/1/2011	Action-adventure	Re-Logic	Re-Logic

	Name	Sales	Series	Release	Genre	Developer	Publisher
0	PlayerUnknown's Battlegrounds	42.0	PlayerUnknown's	12/1/2017	Battle royale	PUBG Studios	Krafton
1	Minecraft	33.0	Minecraft	11/1/2011	Sandbox, survival	Mojang Studios	Mojang Studios
2	Diablo III	20.0	Diablo	5/1/2012	Action role-playing	Blizzard Entertainment	Blizzard Entertainment
3	Garry's Mod	20.0	Garry's	11/1/2006	Sandbox	Facepunch Studios	Valve
4	Terraria	17.2	Terraria	5/1/2011	Action-adventure	Re-Logic	Re-Logic

Case Study 3: Games Sales¶

RevoU DAMC¶

Introduction¶

1. Ask¶

1.1 Business Objectives¶

1.2 Business Tasks¶

2. Prepare¶

2.1 Work Steps¶

3. Process¶

3.1 Data Cleaning¶

4. Analyze¶

4.1 Oldest and Newest Games¶

4.2 Most Productive Developers and Publishers¶

4.3 Game Series and Sales¶

Act¶

Case Study 3: Games Sales¶

RevoU DAMC¶

Introduction¶

1. Ask¶

1.1 Business Objectives¶

1.2 Business Tasks¶

2. Prepare¶

2.1 Work Steps¶

3. Process¶

3.1 Data Cleaning¶

4. Analyze¶

4.1 Oldest and Newest Games¶

4.2 Most Productive Developers and Publishers¶

4.3 Game Series and Sales¶

Share¶

Act¶