DataFrame
A two-dimensional table data structure in pandas.
See also: pandas
DataFrame Methods
Comprehensive reference for DataFrame methods and attributes.
Attributes
T
The transpose of the DataFrame.
df.T # transposed DataFrameat
Access a single value for a row/column label pair.
df.at['row', 'col'] # get/set single value
df.at['row', 'col'] = 5attrs
Dictionary of global attributes.
df.attrs # global attributes dictaxes
Return a list representing the axes.
df.axes # [row_index, column_index]columns
The column labels.
df.columns # Index of columnsdtypes
Return the dtypes in the DataFrame.
df.dtypes # Series of dtypesempty
Indicator whether DataFrame is empty.
df.empty # True/Falseflags
Get properties associated with this pandas object.
df.flags # flags objectiat
Access single value by integer position.
df.iat[0, 0] # row 0, col 0iloc
Purely integer-location based indexing.
df.iloc[0, 1] # row 0, col 1
df.iloc[0:5, :] # first 5 rowsindex
The index (row labels).
df.index # Index of rowsloc
Access by labels or boolean array.
df.loc['row_name', 'col_name'] # single
df.loc[:, 'col_name'] # column
df.loc[df['col'] > 0] # booleanndim
Number of axes/dimensions.
df.ndim # 2 for DataFrameshape
Tuple representing dimensionality.
df.shape # (rows, cols)size
Number of elements.
df.size # rows * colsstyle
Returns Styler object.
df.style # Styler for formattingvalues
Numpy representation.
df.values # numpy arrayMethods
abs()
Return absolute numeric values.
df.abs() # absolute valuesadd(other, axis, level, fill_value)
Addition of DataFrame and other.
df.add(other_df)
df.add(5)add_prefix(prefix, axis)
Prefix labels with string.
df.add_prefix('col_') # prefix columns
df.add_prefix('row_', axis=0)add_suffix(suffix, axis)
Suffix labels with string.
df.add_suffix('_col')agg(func, axis)
Aggregate using operations over axis.
df.agg('sum')
df.agg({'col1': 'sum', 'col2': 'mean'})aggregate(func, axis)
Alias for agg.
df.aggregate('sum')align(other, join, axis, level, copy, …)
Align two objects on their axes.
df1.align(df2, join='outer')all(axis, bool_only, skipna)
Return whether all elements are True.
df.all()
df.all(axis=1)any(axis, bool_only, skipna)
Return whether any element is True.
df.any()apply(func, axis, raw, result_type, args, …)
Apply function along axis.
df.apply(np.sum)
df.apply(lambda x: x * 2)
df.apply('mean')applymap(func, na_action)
Apply function to each element.
df.applymap(lambda x: str(x))asfreq(freq, method, how, normalize, …)
Convert time series to specified frequency.
df.asfreq('D') # dailyasof(where, subset)
Return last row without NaNs before where.
df.asof(where_date)assign(**kwargs)
Assign new columns.
df.assign(new_col=df['a'] + df['b'])astype(dtype, copy, errors)
Cast to specified dtype.
df.astype({'col1': float})
df.astype('int32')at_time(time, asof, axis)
Select values at particular time of day.
df.at_time('9:30')backfill()
Fill NA with next valid observation.
df.backfill()between_time(start_time, end_time, …)
Select values between times.
df.between_time('9:00', '9:30')bfill()
Backfill NA values.
df.bfill()bool()
Return bool of single element.
df.bool()boxplot(column, by, ax, fontsize, rot, …)
Make a box plot.
df.boxplot()clip(lower, upper, axis, inplace)
Trim values at threshold.
df.clip(0, 100)
df.clip(lower=0)combine(other, func, fill_value, overwrite)
Column-wise combine.
df.combine(other_df, lambda x, y: x if x > y else y)combine_first(other)
Update nulls with other values.
df.combine_first(other_df)compare(other, align_axis, keep_shape, …)
Compare DataFrames and show differences.
df.compare(other_df)convert_dtypes(infer_objects, convert_string, …)
Convert to best dtypes.
df.convert_dtypes()copy(deep)
Make a copy.
df.copy()
df.copy(deep=True)corr(method, min_periods, numeric_only)
Compute pairwise correlation.
df.corr()
df.corr('pearson')corrwith(other, axis, drop, method, …)
Compute pairwise correlation.
df.corrwith(other_df)count(axis, numeric_only)
Count non-NA cells.
df.count()
df.count(axis=1)cov(min_periods, ddof, numeric_only)
Compute pairwise covariance.
df.cov()cummax(axis, skipna)
Cumulative maximum.
df.cummax()
df.cummax(axis=1)cummin(axis, skipna)
Cumulative minimum.
df.cummin()cumprod(axis, skipna)
Cumulative product.
df.cumprod()cumsum(axis, skipna)
Cumulative sum.
df.cumsum()describe(percentiles, include, exclude)
Generate descriptive statistics.
df.describe()
df.describe(include='object')diff(periods, axis)
First discrete difference.
df.diff()
df.diff(periods=2)div(other, axis, level, fill_value)
Floating division.
df.div(2)
df.div(other_df)divide(other, axis, level, fill_value)
Alias for div.
df.divide(2)dot(other)
Matrix multiplication.
df.dot(other_df)drop(labels, axis, index, columns, level, …)
Drop specified labels.
df.drop('col', axis=1)
df.drop(['row1', 'row2'])drop_duplicates(subset, keep, inplace, …)
Remove duplicate rows.
df.drop_duplicates()
df.drop_duplicates(subset=['col'])droplevel(level, axis)
Remove index level.
df.droplevel(0)dropna(axis, how, thresh, subset, …)
Remove missing values.
df.dropna()
df.dropna(thresh=2) # keep rows with 2+ non-NAduplicated(subset, keep)
Mark duplicate rows.
df.duplicated()eq(other, axis, level)
Element-wise equal.
df.eq(other_df)
df.eq(5)equals(other)
Test if objects contain same elements.
df.equals(other_df)eval(expr, inplace)
Evaluate string operations on columns.
df.eval('a + b')
df.eval('a > b', inplace=True)ewm(com, span, halflife, alpha, …)
Exponentially weighted calculations.
df.ewm(span=12)expanding(min_periods, axis, method)
Provide expanding window calculations.
df.expanding().sum()explode(column, ignore_index)
Transform list elements to rows.
df.explode('col')ffill()
Forward fill NA values.
df.ffill()fillna(value, method, axis, inplace, …)
Fill NA values.
df.fillna(0)
df.fillna(method='ffill')filter(items, like, regex, axis)
Subset rows/columns.
df.filter(like='test')
df.filter(regex='^a')first(offset)
Select initial periods.
df.first('5D')first_valid_index()
Return index of first non-NA.
df.first_valid_index()floordiv(other, axis, level, fill_value)
Integer division.
df.floordiv(2)from_dict(data, orient, dtype, columns)
Construct from dict.
pd.from_dict({'a': [1, 2], 'b': [3, 4]})from_records(data, index, exclude, …)
Convert structured array to DataFrame.
pd.from_records(array)ge(other, axis, level)
Greater than or equal.
df.ge(5)get(key, default)
Get column or default.
df.get('col')
df.get('missing', default=0)groupby(by, axis, level, as_index, sort, …)
Group DataFrame.
df.groupby('col').sum()
df.groupby(['col1', 'col2']).mean()gt(other, axis, level)
Greater than.
df.gt(5)head(n)
Return first n rows.
df.head(10)hist(column, by, grid, xlabelsize, xrot, …)
Make histogram.
df.hist()idxmax(axis, skipna, numeric_only)
Index of first maximum.
df.idxmax()idxmin(axis, skipna, numeric_only)
Index of first minimum.
df.idxmin()infer_objects(copy)
Infer better dtypes for object columns.
df.infer_objects()info(verbose, buf, max_cols, memory_usage, …)
Print concise summary.
df.info()insert(loc, column, value, allow_duplicates)
Insert column.
df.insert(0, 'new_col', values)interpolate(method, axis, limit, inplace, …)
Fill NaN using interpolation.
df.interpolate()
df.interpolate(method='linear')isetitem(loc, value)
Set value at position.
df.isetitem(0, value)isin(values)
Check if elements in values.
df.isin([1, 2, 3])isna()
Detect missing values.
df.isna()
df.isna().sum()isnull()
Alias for isna.
df.isnull()items()
Iterate over (column, Series) pairs.
for col, series in df.items():
print(col)iterrows()
Iterate over (index, Series) pairs.
for idx, row in df.iterrows():
print(idx, row)itertuples(index, name)
Iterate as namedtuples.
for row in df.itertuples():
print(row)join(other, on, how, lsuffix, rsuffix, …)
Join columns.
df.join(other_df)
df.join(other_df, how='outer')keys()
Get info axis.
df.keys() # alias for columnskurt(axis, skipna, numeric_only)
Unbiased kurtosis.
df.kurt()kurtosis(…)
Alias for kurt.
df.kurtosis()last(offset)
Select final periods.
df.last('5D')last_valid_index()
Index of last non-NA.
df.last_valid_index()le(other, axis, level)
Less than or equal.
df.le(5)lt(other, axis, level)
Less than.
df.lt(5)map(func, na_action)
Apply function elementwise.
df.map(lambda x: x * 2)mask(cond, other, inplace, axis, level)
Replace values where cond is True.
df.mask(df < 0, 0)max(axis, skipna, numeric_only)
Maximum.
df.max()mean(axis, skipna, numeric_only)
Mean.
df.mean()median(axis, skipna, numeric_only)
Median.
df.median()melt(id_vars, value_vars, var_name, …)
Unpivot DataFrame.
df.melt(id_vars=['id'])memory_usage(index, deep)
Memory usage in bytes.
df.memory_usage()
df.memory_usage(deep=True)merge(right, how, on, left_on, right_on, …)
Merge DataFrames.
df.merge(right, on='key')
df.merge(right, left_on='lkey', right_on='rkey')min(axis, skipna, numeric_only)
Minimum.
df.min()mod(other, axis, level, fill_value)
Modulo.
df.mod(2)mode(axis, numeric_only, dropna)
Mode (most frequent values).
df.mode()mul(other, axis, level, fill_value)
Multiplication.
df.mul(2)multiply(other, axis, level, fill_value)
Alias for mul.
df.multiply(2)ne(other, axis, level)
Not equal.
df.ne(5)nlargest(n, columns, keep)
Return first n rows ordered by columns.
df.nlargest(10, 'col')notna()
Detect existing values.
df.notna()notnull()
Alias for notna.
df.notnull()nsmallest(n, columns, keep)
Return first n rows ascending.
df.nsmallest(10, 'col')nunique(axis, dropna)
Count distinct elements.
df.nunique()pad()
Alias for ffill.
df.pad()pct_change(periods, fill_method, limit, freq)
Fractional change.
df.pct_change()pipe(func, *args, **kwargs)
Chainable functions.
df.pipe(func)
df.pipe(func, arg1, arg2)pivot(columns, index, values)
Reshape by columns.
df.pivot(index='idx', columns='col', values='val')pivot_table(values, index, columns, …)
Spreadsheet-style pivot.
df.pivot_table(values='val', index='idx', columns='col', aggfunc='sum')pop(item)
Return and drop column.
df.pop('col')pow(other, axis, level, fill_value)
Exponential power.
df.pow(2)prod(axis, skipna, numeric_only, min_count)
Product.
df.prod()product(…)
Alias for prod.
df.product()quantile(q, axis, numeric_only, …)
Values at given quantile.
df.quantile(0.5)
df.quantile([0.25, 0.75])query(expr, inplace)
Query with boolean expression.
df.query('a > b')
df.query('a > b', inplace=True)radd(other, axis, level, fill_value)
Reverse addition.
df.radd(5) # 5 + dfrank(axis, method, numeric_only, …)
Numerical ranks.
df.rank()
df.rank(method='min')rdiv(other, axis, level, fill_value)
Reverse division.
df.rdiv(1) # 1 / dfreindex(labels, index, columns, axis, …)
Conform to new index.
df.reindex(['a', 'b', 'c'])reindex_like(other, method, copy, limit, …)
Match other index.
df.reindex_like(other)rename(mapper, index, columns, axis, copy, …)
Rename labels.
df.rename(columns={'old': 'new'})
df.rename(index={0: 'a'})rename_axis(mapper, index, columns, axis, …)
Set axis name.
df.rename_axis('rows')reorder_levels(order, axis)
Reorder index levels.
df.reorder_levels([1, 0])replace(to_replace, value, inplace, limit, …)
Replace values.
df.replace(0, -1)
df.replace({'a': 1, 'b': 2})resample(rule, axis, closed, label, …)
Resample time series.
df.resample('D').sum()reset_index(level, drop, inplace, …)
Reset index.
df.reset_index()rfloordiv(other, axis, level, fill_value)
Reverse floor division.
df.rfloordiv(10) # 10 // dfrmod(other, axis, level, fill_value)
Reverse modulo.
df.rmod(10) # 10 % dfrmul(other, axis, level, fill_value)
Reverse multiplication.
df.rmul(2) # 2 * dfrolling(window, min_periods, center, …)
Rolling window calculations.
df.rolling(7).mean()round(decimals)
Round values.
df.round(2)rpow(other, axis, level, fill_value)
Reverse power.
df.rpow(2) # 2 ** dfrsub(other, axis, level, fill_value)
Reverse subtraction.
df.rsub(1) # 1 - dfrtruediv(other, axis, level, fill_value)
Reverse true division.
df.rtruediv(1) # 1 / dfsample(n, frac, replace, weights, …)
Random sample.
df.sample(5)
df.sample(frac=0.1)select_dtypes(include, exclude)
Select by dtype.
df.select_dtypes(include='number')
df.select_dtypes(exclude='object')sem(axis, skipna, ddof, numeric_only)
Standard error of mean.
df.sem()set_axis(labels, axis, copy)
Assign index.
df.set_axis(['a', 'b', 'c'])set_flags(copy, allows_duplicate_labels)
Update flags.
df.set_flags(allows_duplicate_labels=True)set_index(keys, drop, append, inplace, …)
Set index from columns.
df.set_index('col')shift(periods, freq, axis, fill_value, suffix)
Shift index.
df.shift(1) # shift forward
df.shift(-1) # shift backwardskew(axis, skipna, numeric_only)
Skewness.
df.skew()sort_index(axis, level, ascending, …)
Sort by index.
df.sort_index()sort_values(by, axis, ascending, …)
Sort by values.
df.sort_values('col')
df.sort_values(['col1', 'col2'])squeeze(axis)
Squeeze to scalar.
df.squeeze()stack(level, dropna, sort, future_stack)
Stack columns to index.
df.stack()std(axis, skipna, ddof, numeric_only)
Sample standard deviation.
df.std()sub(other, axis, level, fill_value)
Subtraction.
df.sub(1)subtract(other, axis, level, fill_value)
Alias for sub.
df.subtract(1)sum(axis, skipna, numeric_only, min_count)
Sum.
df.sum()swapaxes(axis1, axis2, copy)
Interchange axes.
df.swapaxes(0, 1)swaplevel(i, j, axis)
Swap levels.
df.swaplevel(0, 1)tail(n)
Return last n rows.
df.tail(10)take(indices, axis)
Select by position.
df.take([0, 2, 4])to_clipboard(excel, sep)
Copy to clipboard.
df.to_clipboard()to_csv(path_or_buf, sep, na_rep, …)
Write to CSV.
df.to_csv('file.csv')
df.to_csv()to_dict(orient, into, index)
Convert to dictionary.
df.to_dict()
df.to_dict('records')to_excel(excel_writer, sheet_name, …)
Write to Excel.
df.to_excel('file.xlsx')
df.to_excel('file.xlsx', sheet_name='sheet1')to_feather(path, **kwargs)
Write Feather format.
df.to_feather('file.feather')to_gbq(destination_table, project_id, …)
Write to BigQuery.
df.to_gbq('table', 'project')to_hdf(path_or_buf, key, mode, …)
Write to HDF5.
df.to_hdf('file.h5', key='df')to_html(buf, columns, col_space, header, …)
Render as HTML table.
df.to_html()to_json(path_or_buf, orient, date_format, …)
Convert to JSON.
df.to_json()
df.to_json(orient='records')to_latex(buf, columns, header, index, …)
Render to LaTeX.
df.to_latex()to_markdown(buf, mode, index, storage_options)
Print in Markdown.
df.to_markdown()to_numpy(dtype, copy, na_value)
Convert to numpy array.
df.to_numpy()to_orc(path, engine, index, engine_kwargs)
Write ORC format.
df.to_orc('file.orc')to_parquet(path, engine, compression, …)
Write Parquet format.
df.to_parquet('file.parquet')to_period(freq, axis, copy)
Convert to PeriodIndex.
df.to_period('D')to_pickle(path, compression, protocol, …)
Pickle to file.
df.to_pickle('file.pkl')to_records(index, column_dtypes, index_dtypes)
Convert to record array.
df.to_records()to_sql(name, con, schema, if_exists, …)
Write to SQL.
df.to_sql('table', con)to_stata(path, convert_dates, …)
Export to Stata.
df.to_stata('file.dta')to_string(buf, columns, col_space, header, …)
Console output.
df.to_string()to_timestamp(freq, how, axis, copy)
Cast to timestamps.
df.to_timestamp()to_xarray()
Convert to xarray.
df.to_xarray()to_xml(path_or_buffer, index, root_name, …)
Render to XML.
df.to_xml()transform(func, axis)
Transform columns.
df.transform(lambda x: x * 2)transpose(*args, copy)
Transpose.
df.T
df.transpose()truediv(other, axis, level, fill_value)
True division.
df.truediv(2)truncate(before, after, axis, copy)
Truncate before/after.
df.truncate(before='2020-01-01')tz_convert(tz, axis, level, copy)
Convert timezone.
df.tz_convert('UTC')tz_localize(tz, axis, level, copy, …)
Localize timezone.
df.tz_localize('UTC')unstack(level, fill_value, sort)
Pivot level to columns.
df.unstack()update(other, join, overwrite, …)
Update with other values.
df.update(other_df)value_counts(subset, normalize, sort, …)
Frequency of each row.
df.value_counts()var(axis, skipna, ddof, numeric_only)
Variance.
df.var()where(cond, other, inplace, axis, level)
Replace where False.
df.where(df > 0, -1)xs(key, axis, level, drop_level)
Cross-section.
df.xs('value', level='name')