import pandas as pd | Import pandas |
import numpy as np | Import numpy |
df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) | Create from dict |
df = pd.DataFrame(data, columns=['A', 'B']) | Create with column names |
s = pd.Series([1, 2, 3, 4]) | Create Series |
s = pd.Series([1, 2], index=['a', 'b']) | Series with custom index |
df = pd.DataFrame(np.random.randn(5, 3)) | Create from numpy array |
df = pd.read_csv('file.csv') | Read CSV |
df = pd.read_csv('file.csv', sep=';', header=0) | Read with options |
df = pd.read_excel('file.xlsx', sheet_name='Sheet1') | Read Excel |
df = pd.read_json('file.json') | Read JSON |
df = pd.read_sql('SELECT * FROM table', conn) | Read from SQL |
df = pd.read_parquet('file.parquet') | Read Parquet |
df.to_csv('output.csv', index=False) | Write CSV |
df.to_excel('output.xlsx', index=False) | Write Excel |
df.to_json('output.json', orient='records') | Write JSON |
df.to_parquet('output.parquet') | Write Parquet |
df.head() | First 5 rows |
df.tail(10) | Last 10 rows |
df.shape | Rows and columns count |
df.info() | DataFrame info |
df.describe() | Statistical summary |
df.dtypes | Column data types |
df.columns | Column names |
df.index | Index |
df.values | Data as numpy array |
df.memory_usage() | Memory usage |
df['A'] | Select column as Series |
df[['A', 'B']] | Select multiple columns |
df.A | Dot notation (simple names) |
df.columns.tolist() | Get column names as list |
df[0:5] | First 5 rows by position |
df.iloc[0] | First row by position |
df.iloc[0:5] | Rows 0-4 by position |
df.iloc[[0, 2, 4]] | Specific rows by position |
df.loc['row_label'] | Row by label |
df.loc['a':'c'] | Rows by label range |
df.loc[df.A > 0] | Rows by condition |
df.loc[0, 'A'] | Cell by label |
df.iloc[0, 0] | Cell by position |
df.at[0, 'A'] | Fast scalar access by label |
df.iat[0, 0] | Fast scalar access by position |
df.loc[0:2, 'A':'C'] | Slice rows and columns (label slices are inclusive) |
df.iloc[0:2, 0:3] | Slice by position |
df[df.A > 0] | Filter by condition |
df[(df.A > 0) & (df.B < 5)] | Multiple conditions (AND) |
df[(df.A > 0) | (df.B < 0)] | Multiple conditions (OR) |
df[~(df.A > 0)] | Negate condition |
df[df.A.isin([1, 2, 3])] | Filter by list |
df[df.A.str.contains('pattern')] | String contains |
df[df.A.notna()] | Filter non-null |
df.query("A > 0 and B < 5") | Query string |
df['C'] = df.A + df.B | Add new column |
df['D'] = 0 | Add column with constant |
df.insert(1, 'new_col', values) | Insert at position |
df.assign(E=df.A * 2) | Assign new column (returns copy) |
df['A'] = df.A.apply(lambda x: x * 2) | Apply function |
df.A = df.A.astype(int) | Change data type |
df.rename(columns={'A': 'a', 'B': 'b'}) | Rename columns |
df.drop('A', axis=1) | Drop column |
df.drop(['A', 'B'], axis=1) | Drop multiple columns |
df.drop(0, axis=0) | Drop row by index |
df.drop([0, 1, 2]) | Drop multiple rows |
df.drop_duplicates() | Drop duplicate rows |
df.drop_duplicates(subset=['A']) | Drop duplicates by column |
df.dropna() | Drop rows with NaN |
df.dropna(subset=['A']) | Drop NaN in specific columns |
df.isna() | Check for NaN |
df.isna().sum() | Count NaN per column |
df.fillna(0) | Fill NaN with value |
df.ffill() | Forward fill (fillna(method='ffill') is deprecated) |
df.bfill() | Backward fill (fillna(method='bfill') is deprecated) |
df.fillna(df.mean()) | Fill with mean |
df.interpolate() | Interpolate missing values |
df.replace({'old': 'new'}) | Replace values |
df.sort_values('A') | Sort by column |
df.sort_values('A', ascending=False) | Sort descending |
df.sort_values(['A', 'B']) | Sort by multiple columns |
df.sort_index() | Sort by index |
df.nlargest(5, 'A') | Top 5 by column |
df.nsmallest(5, 'A') | Bottom 5 by column |
df.sum() | Sum of each column |
df.mean() | Mean of each column |
df.median() | Median of each column |
df.std() | Standard deviation |
df.var() | Variance |
df.min() | Min of each column |
df.max() | Max of each column |
df.count() | Count non-null values |
df.nunique() | Count unique values |
df.A.value_counts() | Count of each value |
df.groupby('A').sum() | Group and sum |
df.groupby('A').mean() | Group and mean |
df.groupby(['A', 'B']).count() | Group by multiple columns |
df.groupby('A').agg(['sum', 'mean']) | Multiple aggregations |
df.groupby('A').agg({'B': 'sum', 'C': 'mean'}) | Different agg per column |
df.groupby('A').transform('mean') | Transform (keeps shape) |
df.groupby('A').apply(lambda x: x.nlargest(2, 'B')) | Apply custom function |
df.groupby('A').filter(lambda x: x.B.mean() > 0) | Filter groups |
df.pivot(index='A', columns='B', values='C') | Pivot (reshape, no aggregation) |
df.pivot_table(values='C', index='A', columns='B', aggfunc='mean') | Pivot with aggregation |
pd.melt(df, id_vars=['A'], value_vars=['B', 'C']) | Unpivot (wide to long) |
df.stack() | Stack columns to rows |
df.unstack() | Unstack rows to columns |
df.T | Transpose |
pd.merge(df1, df2, on='key') | Merge on column |
pd.merge(df1, df2, on=['key1', 'key2']) | Merge on multiple columns |
pd.merge(df1, df2, left_on='a', right_on='b') | Merge different column names |
pd.merge(df1, df2, how='left') | Left join |
pd.merge(df1, df2, how='right') | Right join |
pd.merge(df1, df2, how='outer') | Outer join |
pd.merge(df1, df2, how='inner') | Inner join (default) |
df1.join(df2, on='key') | Join on index |
pd.concat([df1, df2]) | Concatenate rows |
pd.concat([df1, df2], axis=1) | Concatenate columns |
pd.concat([df1, df2], ignore_index=True) | Reset index after concat |
pd.concat([df1, df2], keys=['a', 'b']) | Concat with keys |
df1.append(df2) | Append rows (removed in pandas 2.0; use pd.concat) |
df['date'] = pd.to_datetime(df['date']) | Convert to datetime |
pd.to_datetime('2024-01-01') | Parse date string |
pd.to_datetime(df['date'], format='%Y-%m-%d') | Parse with format |
df.date.dt.year | Extract year |
df.date.dt.month | Extract month |
df.date.dt.day | Extract day |
df.date.dt.dayofweek | Day of week (0=Mon) |
df.date.dt.hour | Extract hour |
df.date.dt.strftime('%Y-%m-%d') | Format to string |
pd.date_range('2024-01-01', periods=10) | Create date range |
pd.date_range('2024-01-01', '2024-12-31', freq='ME') | Month-end date range ('M' is deprecated) |
df.set_index('date').resample('ME').mean() | Resample monthly ('M' is deprecated) |
df.resample('D').sum() | Resample daily |
df.rolling(window=7).mean() | 7-day rolling mean |
df.shift(1) | Shift by 1 period |
df.diff() | Difference from previous |
df.pct_change() | Percent change |