# Pandas

In [1]:
import numpy as np
import pandas as pd
# Record which pandas version produced the outputs shown in this notebook
print("Pandas version:", pd.__version__)

Pandas version: 2.3.3


## 1. Series: one-dimensional data

### 1.1. Creating series 

In [2]:
# Build a Series from a plain list; with no index given, pandas labels
# the values 0..4 automatically.
readings = [22, 25, 23, 28, 24]
temp = pd.Series(data=readings)
temp

0    22
1    25
2    23
3    28
4    24
dtype: int64

In [3]:
# Confirm that pd.Series(...) produced a pandas Series object
type(temp)

pandas.core.series.Series

In [4]:
# Same five readings, now labelled by weekday instead of by position
day = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri']
readings = [22, 25, 23, 28, 24]
temp = pd.Series(data=readings, index=day)

temp

Mon      22
Tues     25
Wed      23
Thurs    28
Fri      24
dtype: int64

### 1.2. Accessing series data

In [5]:
# .iloc is position-based: position 2 is the third element
third_reading = temp.iloc[2]
print(third_reading)

23


In [6]:
# Look the value up by its index label rather than by position
wednesday_reading = temp['Wed']
print(wednesday_reading)

23


### 1.3. Series operations

In [7]:
# Average of all values in the Series
average_temp = temp.mean()
print(average_temp)

24.4


In [8]:
# Largest value in the Series
highest_temp = temp.max()
print(highest_temp)

28


## 2. DataFrame: two-dimensional data

### 2.1. Creating DataFrames

In [9]:
# Column-oriented records: each key names a field and each list holds
# that field's values, one entry per person.
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'Diana'],
    age=[25, 30, 35, 28],
    city=['New York', 'Paris', 'London', 'Tokyo'],
    salary=[50000, 60000, 55000, 58000],
)

type(data)

dict

In [10]:
# Turn the dict of lists into a DataFrame: the keys become column names
# and the rows get a default 0-based index
df = pd.DataFrame(data=data)
df

Unnamed: 0,name,age,city,salary
0,Alice,25,New York,50000
1,Bob,30,Paris,60000
2,Charlie,35,London,55000
3,Diana,28,Tokyo,58000


### 2.2. Setting the index

In [11]:
# Replace the default 0-based index with the labels 1-4.
# The labels are wrapped in pd.Index so set_index uses them as the index
# values; a plain list would be interpreted as column names.
index = [1, 2, 3, 4]
# Reassign the result instead of using inplace=True, matching how
# rename() and set_index() are used later in this notebook.
df = df.set_index(pd.Index(index))
df

Unnamed: 0,name,age,city,salary
1,Alice,25,New York,50000
2,Bob,30,Paris,60000
3,Charlie,35,London,55000
4,Diana,28,Tokyo,58000


In [12]:
# Display df again: the index set in the previous cell is still in effect
df

Unnamed: 0,name,age,city,salary
1,Alice,25,New York,50000
2,Bob,30,Paris,60000
3,Charlie,35,London,55000
4,Diana,28,Tokyo,58000


In [13]:
# Reset to the default 0-based index. drop=True discards the old index
# labels instead of inserting them as a new column.
# The result is reassigned rather than using inplace=True.
df = df.reset_index(drop=True)
df

Unnamed: 0,name,age,city,salary
0,Alice,25,New York,50000
1,Bob,30,Paris,60000
2,Charlie,35,London,55000
3,Diana,28,Tokyo,58000


In [14]:
# Display df again: the index is back to the default 0-3
df

Unnamed: 0,name,age,city,salary
0,Alice,25,New York,50000
1,Bob,30,Paris,60000
2,Charlie,35,London,55000
3,Diana,28,Tokyo,58000


### 2.3. Getting and setting column names

In [15]:
# Get column names (returned as a pandas Index of labels)
df.columns

Index(['name', 'age', 'city', 'salary'], dtype='object')

In [16]:
# Rename specific column(s): the dict maps each old name to its new name;
# columns not listed keep their current name
rename_dict = {'city': 'location'}

# rename() returns a new DataFrame, so assign it back to df
df = df.rename(columns=rename_dict)
df

Unnamed: 0,name,age,location,salary
0,Alice,25,New York,50000
1,Bob,30,Paris,60000
2,Charlie,35,London,55000
3,Diana,28,Tokyo,58000


In [17]:
# Replace every column name at once; the list must supply one name per
# existing column, in column order
new_column_names = ['full_name', 'age_years', 'location', 'annual_salary']
df.columns = new_column_names
df

Unnamed: 0,full_name,age_years,location,annual_salary
0,Alice,25,New York,50000
1,Bob,30,Paris,60000
2,Charlie,35,London,55000
3,Diana,28,Tokyo,58000


### 2.4. Removing rows or columns

In [18]:
# Work on an independent copy so the drop examples below leave df untouched
drop_df = df.copy(deep=True)
drop_df

Unnamed: 0,full_name,age_years,location,annual_salary
0,Alice,25,New York,50000
1,Bob,30,Paris,60000
2,Charlie,35,London,55000
3,Diana,28,Tokyo,58000


In [19]:
# Drop a column. columns= names the axis explicitly (equivalent to axis=1),
# and the result is reassigned instead of using inplace=True.
drop_df = drop_df.drop(columns='annual_salary')
drop_df

Unnamed: 0,full_name,age_years,location
0,Alice,25,New York
1,Bob,30,Paris
2,Charlie,35,London
3,Diana,28,Tokyo


In [20]:
# Drop a row by its index label (2 is a label here, not a position).
# index= names the axis explicitly (equivalent to axis=0), and the result
# is reassigned instead of using inplace=True.
drop_df = drop_df.drop(index=2)
drop_df

Unnamed: 0,full_name,age_years,location
0,Alice,25,New York
1,Bob,30,Paris
3,Diana,28,Tokyo


### 2.5. Getting information about DataFrames

In [21]:
# First two rows
df.head(n=2)

Unnamed: 0,full_name,age_years,location,annual_salary
0,Alice,25,New York,50000
1,Bob,30,Paris,60000


In [22]:
# Last two rows
df.tail(n=2)

Unnamed: 0,full_name,age_years,location,annual_salary
2,Charlie,35,London,55000
3,Diana,28,Tokyo,58000


In [23]:
# Summarize the frame: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   full_name      4 non-null      object
 1   age_years      4 non-null      int64 
 2   location       4 non-null      object
 3   annual_salary  4 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 260.0+ bytes


In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,age_years,annual_salary
count,4.0,4.0
mean,29.5,55750.0
std,4.203173,4349.32945
min,25.0,50000.0
25%,27.25,53750.0
50%,29.0,56500.0
75%,31.25,58500.0
max,35.0,60000.0


### 2.6. Selecting data

In [25]:
# Select a column: a single label inside [] returns that column as a Series
df['full_name']

0      Alice
1        Bob
2    Charlie
3      Diana
Name: full_name, dtype: object

In [26]:
# Select multiple columns: indexing with a list of labels returns a DataFrame
selected_columns = ['full_name', 'location']
df[selected_columns]

Unnamed: 0,full_name,location
0,Alice,New York
1,Bob,Paris
2,Charlie,London
3,Diana,Tokyo


In [27]:
# Give the rows text labels. The labels are wrapped in pd.Index so that
# set_index uses them as index values; a plain list would be read as
# column names.
index = ['first', 'second', 'third', 'fourth']
row_labels = pd.Index(index)
df = df.set_index(row_labels)
df

Unnamed: 0,full_name,age_years,location,annual_salary
first,Alice,25,New York,50000
second,Bob,30,Paris,60000
third,Charlie,35,London,55000
fourth,Diana,28,Tokyo,58000


In [28]:
# Select a row by its index label; the row comes back as a Series
# whose index is the column names
df.loc['first']

full_name           Alice
age_years              25
location         New York
annual_salary       50000
Name: first, dtype: object

In [29]:
# Select multiple rows: passing a list of index labels to .loc returns a DataFrame
wanted_rows = ['first', 'second']
df.loc[wanted_rows]

Unnamed: 0,full_name,age_years,location,annual_salary
first,Alice,25,New York,50000
second,Bob,30,Paris,60000


In [30]:
# Select specific rows and columns: .loc takes row labels first, then column labels
wanted_rows = ['first', 'second']
wanted_columns = ['full_name', 'annual_salary']
df.loc[wanted_rows, wanted_columns]

Unnamed: 0,full_name,annual_salary
first,Alice,50000
second,Bob,60000


### 2.7. Filtering data

In [31]:
# Select rows based on a condition: the comparison yields one boolean per
# row, and indexing with it keeps only the rows marked True
earns_over_50k = df['annual_salary'] > 50000
df[earns_over_50k]

Unnamed: 0,full_name,age_years,location,annual_salary
second,Bob,30,Paris,60000
third,Charlie,35,London,55000
fourth,Diana,28,Tokyo,58000


In [32]:
# Select rows where location is Tokyo
located_in_tokyo = df['location'] == 'Tokyo'
df[located_in_tokyo]

Unnamed: 0,full_name,age_years,location,annual_salary
fourth,Diana,28,Tokyo,58000


In [33]:
# Store the boolean mask in a named variable, then use it to filter the rows
salary_thresh = df['annual_salary'] > 50000
df[salary_thresh]

Unnamed: 0,full_name,age_years,location,annual_salary
second,Bob,30,Paris,60000
third,Charlie,35,London,55000
fourth,Diana,28,Tokyo,58000


## 3. Handling missing data

In [43]:
# Build a small frame in which np.nan marks the missing entries
# (Bob's age and Charlie's salary)
columns_with_gaps = {
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'age': [25, np.nan, 35, 28],
    'salary': [50000, 60000, np.nan, 58000],
}
data_with_missing = pd.DataFrame(columns_with_gaps)

data_with_missing

Unnamed: 0,name,age,salary
0,Alice,25.0,50000.0
1,Bob,,60000.0
2,Charlie,35.0,
3,Diana,28.0,58000.0


In [44]:
# Null values in info(): the Non-Null Count drops below the number of
# entries for any column that contains missing values
data_with_missing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    4 non-null      object 
 1   age     3 non-null      float64
 2   salary  3 non-null      float64
dtypes: float64(2), object(1)
memory usage: 228.0+ bytes


In [45]:
# Finding null values: isnull() returns a same-shaped frame of booleans,
# True wherever a value is missing
data_with_missing.isnull()

Unnamed: 0,name,age,salary
0,False,False,False
1,False,True,False
2,False,False,True
3,False,False,False


In [46]:
# Finding null value counts by feature: summing the boolean mask counts
# the True entries in each column
null_mask = data_with_missing.isnull()
null_mask.sum()

name      0
age       1
salary    1
dtype: int64

In [47]:
# Filling null values with a specific value; fillna returns a new frame
# and leaves data_with_missing itself unchanged
data_with_missing.fillna(value=0)

Unnamed: 0,name,age,salary
0,Alice,25.0,50000.0
1,Bob,0.0,60000.0
2,Charlie,35.0,0.0
3,Diana,28.0,58000.0


In [48]:
# Filling null values with the mean of the column
# (mean() skips NaN, so it averages only the three known ages)
mean_age = data_with_missing['age'].mean()
data_with_missing['age'].fillna(mean_age)

0    25.000000
1    29.333333
2    35.000000
3    28.000000
Name: age, dtype: float64

## 4. Working with data types

In [49]:
# Every column is deliberately stored as text, including the ones that
# hold numbers, booleans and dates, so info() reports them all as object
text_columns = {
    'numbers_as_text': ['1', '2', '3', '4', '5'],
    'prices': ['10.50', '20.75', '15.25', '30.00', '25.50'],
    'categories': ['A', 'B', 'A', 'C', 'B'],
    'is_active': ['True', 'False', 'True', 'True', 'False'],
    'dates_as_text': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
}
mixed_data = pd.DataFrame(text_columns)

mixed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   numbers_as_text  5 non-null      object
 1   prices           5 non-null      object
 2   categories       5 non-null      object
 3   is_active        5 non-null      object
 4   dates_as_text    5 non-null      object
dtypes: object(5)
memory usage: 332.0+ bytes


In [50]:
# Convert the price strings to floats; info() now lists prices as float64
prices_as_float = mixed_data['prices'].astype(float)
mixed_data['prices'] = prices_as_float
mixed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   numbers_as_text  5 non-null      object 
 1   prices           5 non-null      float64
 2   categories       5 non-null      object 
 3   is_active        5 non-null      object 
 4   dates_as_text    5 non-null      object 
dtypes: float64(1), object(4)
memory usage: 332.0+ bytes


In [51]:
# Downcast prices to 32-bit floats; info() reports the smaller memory usage
prices_as_float32 = mixed_data['prices'].astype(np.float32)
mixed_data['prices'] = prices_as_float32
mixed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   numbers_as_text  5 non-null      object 
 1   prices           5 non-null      float32
 2   categories       5 non-null      object 
 3   is_active        5 non-null      object 
 4   dates_as_text    5 non-null      object 
dtypes: float32(1), object(4)
memory usage: 312.0+ bytes
