data.csv
| Programming language | Designed by | Appeared | Extension |
|---|---|---|---|
| Python | Guido van Rossum | 1991 | .py |
| Java | James Gosling | 1995 | .java |
| C++ | Bjarne Stroustrup | 1983 | .cpp |
import pandas as pd

data = pd.read_csv('data.csv')
print(data)
  Programming language        Designed by  Appeared Extension
0               Python   Guido van Rossum      1991       .py
1                 Java      James Gosling      1995     .java
2                  C++  Bjarne Stroustrup      1983      .cpp
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Programming language  3 non-null      object
 1   Designed by           3 non-null      object
 2   Appeared              3 non-null      int64
 3   Extension             3 non-null      object
dtypes: int64(1), object(3)
memory usage: 224.0+ bytes
print(data.columns)
Index(['Programming language', 'Designed by', 'Appeared', 'Extension'], dtype='object')
# Use DataFrame.T to transpose a dataframe.
print(data.T)
                                     0              1                  2
Programming language            Python           Java                C++
Designed by           Guido van Rossum  James Gosling  Bjarne Stroustrup
Appeared                          1991           1995               1983
Extension                          .py          .java               .cpp
# Use DataFrame.describe() to get summary statistics about data.
print(data.describe())
          Appeared
count     3.000000
mean   1989.666667
std       6.110101
min    1983.000000
25%    1987.000000
50%    1991.000000
75%    1993.000000
max    1995.000000
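By default describe() summarizes only the numeric columns. A small sketch (rebuilding the same frame) of asking for the string columns as well:

```python
import pandas as pd

data = pd.DataFrame({
    "Programming language": ["Python", "Java", "C++"],
    "Designed by": ["Guido van Rossum", "James Gosling", "Bjarne Stroustrup"],
    "Appeared": [1991, 1995, 1983],
    "Extension": [".py", ".java", ".cpp"],
})

# include='object' summarizes the string columns: count, unique, top, freq.
print(data.describe(include="object"))

# include='all' mixes both kinds of summaries (NaN where a statistic
# does not apply to a column).
print(data.describe(include="all"))
```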
# Creating a Series by passing a list of values, letting pandas create a default integer index.
import numpy as np
s = pd.Series([1, 3, 5, np.nan, 6, 8]); s
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
# Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns.
dates = pd.date_range("20130101", periods=6); dates
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD")); df
| A | B | C | D | |
|---|---|---|---|---|
| 2013-01-01 | -1.400367 | 0.485672 | -0.497244 | -1.528443 |
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 0.095149 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | -0.703365 |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 0.802319 |
| 2013-01-06 | 1.041973 | -1.543057 | -0.657560 | -0.941966 |
# Creating a DataFrame by passing a dict of objects that can be converted to series-like.
df2 = pd.DataFrame(
{
"A": 1.0,
"B": pd.Timestamp("20130102"),
"C": pd.Series(1, index=list(range(4)), dtype="float32"),
"D": np.array([3] * 4, dtype="int32"),
"E": pd.Categorical(["test", "train", "test", "train"]),
"F": "foo",
}
)
df2
| A | B | C | D | E | F | |
|---|---|---|---|---|---|---|
| 0 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
| 1 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
| 2 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
| 3 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
df2.dtypes
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
# If you’re using IPython, tab completion for column names (as well as public attributes) is automatically enabled. Here’s a subset of the
# attributes that will be completed:
df2.B
0   2013-01-02
1   2013-01-02
2   2013-01-02
3   2013-01-02
Name: B, dtype: datetime64[ns]
# Here is how to view the top and bottom rows of the frame:
df.head()
| A | B | C | D | |
|---|---|---|---|---|
| 2013-01-01 | -1.400367 | 0.485672 | -0.497244 | -1.528443 |
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 0.095149 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | -0.703365 |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 0.802319 |
df.tail(3)
| A | B | C | D | |
|---|---|---|---|---|
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | -0.703365 |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 0.802319 |
| 2013-01-06 | 1.041973 | -1.543057 | -0.657560 | -0.941966 |
# Display the index and columns:
df.index
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
# DataFrame.to_numpy() gives a NumPy representation of the underlying data.
# Note that this can be an expensive operation when your DataFrame has columns with different data types, which comes down to a fundamental
# difference between pandas and NumPy: NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column.
# When you call DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. This may end up being
# 'object', which requires casting every value to a Python object.
# For df, our DataFrame of all floating-point values, DataFrame.to_numpy() is fast and doesn’t require copying data.
df.to_numpy()
array([[-1.40036733, 0.48567217, -0.49724373, -1.52844315],
[-0.26906424, -3.41069224, 0.72541009, 0.09514947],
[ 0.71798702, -0.69214178, -0.98713943, 0.13108482],
[ 1.09289196, 1.42299545, 0.10907936, -0.70336497],
[ 1.86064349, 0.47839378, 0.06436552, 0.80231914],
[ 1.04197281, -1.54305651, -0.65755971, -0.94196642]])
# For df2, the DataFrame with multiple dtypes, DataFrame.to_numpy() is relatively expensive.
# DataFrame.to_numpy() does not include the index or column labels in the output.
df2.to_numpy()
array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
dtype=object)
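A quick way to see the difference is to compare the dtype of the resulting arrays. A sketch, rebuilding a homogeneous frame and a mixed one along the lines of df and df2 above:

```python
import numpy as np
import pandas as pd

# Homogeneous floats: NumPy can keep a native float64 array, no copy needed.
df = pd.DataFrame(np.random.randn(6, 4), columns=list("ABCD"))
print(df.to_numpy().dtype)

# Mixed dtypes (floats plus a categorical of strings): the only NumPy dtype
# that can hold everything is object, so every value is boxed.
df2 = pd.DataFrame({
    "A": 1.0,
    "C": pd.Series(1, index=range(4), dtype="float32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
})
print(df2.to_numpy().dtype)
```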
# describe() shows a quick statistic summary of your data.
df.describe()
| A | B | C | D | |
|---|---|---|---|---|
| count | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
| mean | 0.507344 | -0.543138 | -0.207181 | -0.357537 |
| std | 1.161607 | 1.744280 | 0.622703 | 0.851013 |
| min | -1.400367 | -3.410692 | -0.987139 | -1.528443 |
| 25% | -0.022301 | -1.330328 | -0.617481 | -0.882316 |
| 50% | 0.879980 | -0.106874 | -0.216439 | -0.304108 |
| 75% | 1.080162 | 0.483853 | 0.097901 | 0.122101 |
| max | 1.860643 | 1.422995 | 0.725410 | 0.802319 |
df.T
| 2013-01-01 | 2013-01-02 | 2013-01-03 | 2013-01-04 | 2013-01-05 | 2013-01-06 | |
|---|---|---|---|---|---|---|
| A | -1.400367 | -0.269064 | 0.717987 | 1.092892 | 1.860643 | 1.041973 |
| B | 0.485672 | -3.410692 | -0.692142 | 1.422995 | 0.478394 | -1.543057 |
| C | -0.497244 | 0.725410 | -0.987139 | 0.109079 | 0.064366 | -0.657560 |
| D | -1.528443 | 0.095149 | 0.131085 | -0.703365 | 0.802319 | -0.941966 |
# Sorting by an axis.
df.sort_index(axis=1, ascending=False)
| D | C | B | A | |
|---|---|---|---|---|
| 2013-01-01 | -1.528443 | -0.497244 | 0.485672 | -1.400367 |
| 2013-01-02 | 0.095149 | 0.725410 | -3.410692 | -0.269064 |
| 2013-01-03 | 0.131085 | -0.987139 | -0.692142 | 0.717987 |
| 2013-01-04 | -0.703365 | 0.109079 | 1.422995 | 1.092892 |
| 2013-01-05 | 0.802319 | 0.064366 | 0.478394 | 1.860643 |
| 2013-01-06 | -0.941966 | -0.657560 | -1.543057 | 1.041973 |
df.sort_values(by="B")
| A | B | C | D | |
|---|---|---|---|---|
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 0.095149 |
| 2013-01-06 | 1.041973 | -1.543057 | -0.657560 | -0.941966 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 0.802319 |
| 2013-01-01 | -1.400367 | 0.485672 | -0.497244 | -1.528443 |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | -0.703365 |
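sort_values() also accepts several sort keys at once, with per-key directions. A small sketch on a toy frame (the column names are illustrative):

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [4, 3, 2, 1]})

# Sort by A ascending; ties within A are broken by B descending.
out = df.sort_values(by=["A", "B"], ascending=[True, False])
print(out)
```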
# Selecting a single column, which yields a Series, equivalent to df.A:
df["A"]
2013-01-01   -1.400367
2013-01-02   -0.269064
2013-01-03    0.717987
2013-01-04    1.092892
2013-01-05    1.860643
2013-01-06    1.041973
Freq: D, Name: A, dtype: float64
# Selecting via [], which slices the rows:
df[0:3]
| A | B | C | D | |
|---|---|---|---|---|
| 2013-01-01 | -1.400367 | 0.485672 | -0.497244 | -1.528443 |
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 0.095149 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 |
df["20130102":"20130104"]
| A | B | C | D | |
|---|---|---|---|---|
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 0.095149 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | -0.703365 |
# For getting a cross section using a label:
df.loc[dates[0]]
A   -1.400367
B    0.485672
C   -0.497244
D   -1.528443
Name: 2013-01-01 00:00:00, dtype: float64
df.loc[dates[0:3], ["A", "B"]]
| A | B | |
|---|---|---|
| 2013-01-01 | -1.400367 | 0.485672 |
| 2013-01-02 | -0.269064 | -3.410692 |
| 2013-01-03 | 0.717987 | -0.692142 |
df.loc[:, ["A", "B"]]
| A | B | |
|---|---|---|
| 2013-01-01 | -1.400367 | 0.485672 |
| 2013-01-02 | -0.269064 | -3.410692 |
| 2013-01-03 | 0.717987 | -0.692142 |
| 2013-01-04 | 1.092892 | 1.422995 |
| 2013-01-05 | 1.860643 | 0.478394 |
| 2013-01-06 | 1.041973 | -1.543057 |
df[:3]
| A | B | C | D | |
|---|---|---|---|---|
| 2013-01-01 | -1.400367 | 0.485672 | -0.497244 | -1.528443 |
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 0.095149 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 |
df[::2]
| A | B | C | D | |
|---|---|---|---|---|
| 2013-01-01 | -1.400367 | 0.485672 | -0.497244 | -1.528443 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 0.802319 |
df.loc["20130104":"20130106", ["A", "B"]]
| A | B | |
|---|---|---|
| 2013-01-04 | 1.092892 | 1.422995 |
| 2013-01-05 | 1.860643 | 0.478394 |
| 2013-01-06 | 1.041973 | -1.543057 |
# .loc is strict when you present slicers that are not compatible (or convertible) with the index type. For example using integers
# in a DatetimeIndex. These will raise a TypeError:
df.loc[:3, ["A", "B"]]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
...
TypeError: cannot do slice indexing on DatetimeIndex with these indexers [3] of type int
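When you want the first rows by position, use iloc; with loc, stick to labels the DatetimeIndex can interpret. A sketch of both equivalents:

```python
import numpy as np
import pandas as pd

dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))

# Positional slicing belongs to iloc:
first3 = df.iloc[:3][["A", "B"]]

# With loc, slice by labels; note that loc slices are inclusive of the stop label:
same3 = df.loc[:dates[2], ["A", "B"]]

print(first3.equals(same3))
```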
df.loc["20130102", ["A", "B"]]
A   -0.269064
B   -3.410692
Name: 2013-01-02 00:00:00, dtype: float64
# For getting a scalar value:
df.loc[dates[0], "A"]
-1.4003673319172476
# For getting fast access to a scalar (equivalent to the prior method):
df.at[dates[0], "A"]
-1.4003673319172476
# Select via the position of the passed integers:
df.iloc[3]
A    1.092892
B    1.422995
C    0.109079
D   -0.703365
Name: 2013-01-04 00:00:00, dtype: float64
# By integer slices, acting similar to numpy/Python:
df.iloc[3:5, 0:2]
| A | B | |
|---|---|---|
| 2013-01-04 | 1.092892 | 1.422995 |
| 2013-01-05 | 1.860643 | 0.478394 |
# By lists of integer position locations, similar to the NumPy/Python style:
df.iloc[[1, 2, 4], [0, 2]]
| A | C | |
|---|---|---|
| 2013-01-02 | -0.269064 | 0.725410 |
| 2013-01-03 | 0.717987 | -0.987139 |
| 2013-01-05 | 1.860643 | 0.064366 |
# For slicing rows explicitly:
df.iloc[1:3, :]
| A | B | C | D | |
|---|---|---|---|---|
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 0.095149 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 |
# For slicing columns explicitly:
df.iloc[:, 1:3]
| B | C | |
|---|---|---|
| 2013-01-01 | 0.485672 | -0.497244 |
| 2013-01-02 | -3.410692 | 0.725410 |
| 2013-01-03 | -0.692142 | -0.987139 |
| 2013-01-04 | 1.422995 | 0.109079 |
| 2013-01-05 | 0.478394 | 0.064366 |
| 2013-01-06 | -1.543057 | -0.657560 |
df.iloc[:3, [1, 2]]
| B | C | |
|---|---|---|
| 2013-01-01 | 0.485672 | -0.497244 |
| 2013-01-02 | -3.410692 | 0.725410 |
| 2013-01-03 | -0.692142 | -0.987139 |
# For getting a value explicitly:
df.iloc[1, 1]
-3.4106922431367956
# For getting fast access to a scalar (equivalent to the prior method):
df.iat[1, 1]
-3.4106922431367956
# Boolean indexing
# Using a single column’s values to select data:
df[df["A"] > 0]
| A | B | C | D | |
|---|---|---|---|---|
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | -0.703365 |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 0.802319 |
| 2013-01-06 | 1.041973 | -1.543057 | -0.657560 | -0.941966 |
# Selecting values from a DataFrame where a boolean condition is met:
df[df > 0]
| A | B | C | D | |
|---|---|---|---|---|
| 2013-01-01 | NaN | 0.485672 | NaN | NaN |
| 2013-01-02 | NaN | NaN | 0.725410 | 0.095149 |
| 2013-01-03 | 0.717987 | NaN | NaN | 0.131085 |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | NaN |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 0.802319 |
| 2013-01-06 | 1.041973 | NaN | NaN | NaN |
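Several conditions can be combined into one mask, but with the element-wise operators & and | (Python's `and`/`or` do not work on Series) and parentheses around each comparison. A sketch:

```python
import numpy as np
import pandas as pd

dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))

# Rows where A is positive AND B is negative; each comparison must be
# parenthesized because & binds tighter than > and <.
mask = (df["A"] > 0) & (df["B"] < 0)
print(df[mask])
```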
# Using the isin() method for filtering:
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]; df2
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2013-01-01 | -1.400367 | 0.485672 | -0.497244 | -1.528443 | one |
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 0.095149 | one |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 | two |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | -0.703365 | three |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 0.802319 | four |
| 2013-01-06 | 1.041973 | -1.543057 | -0.657560 | -0.941966 | three |
df2[df2["E"].isin(["two", "four"])]
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 | two |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 0.802319 | four |
df2.isin(["two", "four"])
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2013-01-01 | False | False | False | False | False |
| 2013-01-02 | False | False | False | False | False |
| 2013-01-03 | False | False | False | False | True |
| 2013-01-04 | False | False | False | False | False |
| 2013-01-05 | False | False | False | False | True |
| 2013-01-06 | False | False | False | False | False |
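The mask returned by isin() can also be inverted with ~ to keep everything *not* in the list. A sketch using the same E column:

```python
import pandas as pd

df2 = pd.DataFrame({"E": ["one", "one", "two", "three", "four", "three"]})

# ~ flips every boolean in the mask, selecting the complement.
out = df2[~df2["E"].isin(["two", "four"])]
print(out)
```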
# Setting a new column automatically aligns the data by the indexes:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6)); s1
2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64
df2["F"] = s1; df2
| A | B | C | D | E | F | |
|---|---|---|---|---|---|---|
| 2013-01-01 | -1.400367 | 0.485672 | -0.497244 | -1.528443 | one | NaN |
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 0.095149 | one | 1.0 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 | two | 2.0 |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | -0.703365 | three | 3.0 |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 0.802319 | four | 4.0 |
| 2013-01-06 | 1.041973 | -1.543057 | -0.657560 | -0.941966 | three | 5.0 |
# Setting values by label:
df2.at[dates[0], "A"] = 0; df2
| A | B | C | D | E | F | |
|---|---|---|---|---|---|---|
| 2013-01-01 | 0.000000 | 0.485672 | -0.497244 | -1.528443 | one | NaN |
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 0.095149 | one | 1.0 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 | two | 2.0 |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | -0.703365 | three | 3.0 |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 0.802319 | four | 4.0 |
| 2013-01-06 | 1.041973 | -1.543057 | -0.657560 | -0.941966 | three | 5.0 |
# Setting values by position:
df2.iat[0, 1] = 0; df2
| A | B | C | D | E | F | |
|---|---|---|---|---|---|---|
| 2013-01-01 | 0.000000 | 0.000000 | -0.497244 | -1.528443 | one | NaN |
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 0.095149 | one | 1.0 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 0.131085 | two | 2.0 |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | -0.703365 | three | 3.0 |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 0.802319 | four | 4.0 |
| 2013-01-06 | 1.041973 | -1.543057 | -0.657560 | -0.941966 | three | 5.0 |
# Setting by assigning with a NumPy array:
df2.loc[:, "D"] = np.array([5] * len(df2)); df2
| A | B | C | D | E | F | |
|---|---|---|---|---|---|---|
| 2013-01-01 | 0.000000 | 0.000000 | -0.497244 | 5 | one | NaN |
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 5 | one | 1.0 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 5 | two | 2.0 |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | 5 | three | 3.0 |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 5 | four | 4.0 |
| 2013-01-06 | 1.041973 | -1.543057 | -0.657560 | 5 | three | 5.0 |
type(df2.loc[dates[0], "D"])
numpy.int64
val = np.int64(0)
pyval = val.item()
print(type(pyval))
<class 'int'>
# and similar...
# type(np.float64(0).item())                         # <class 'float'>
# type(np.uint32(0).item())                          # <class 'int'>
# type(np.int16(0).item())                           # <class 'int'>
# type(np.complex128(0).item())                      # <class 'complex'>
# type(np.datetime64(0, 'D').item())                 # <class 'datetime.date'>
# type(np.datetime64('2001-01-01 00:00:00').item())  # <class 'datetime.datetime'>
# type(np.timedelta64(0, 'D').item())                # <class 'datetime.timedelta'>
g = df2.loc[dates[0], "D"]
pynum = g.item()
type(pynum)
int
# Drop the string column 'E' so the frame is all numeric:
df3 = df2.drop(columns=['E'])
df2 = df3; df2
| A | B | C | D | F | |
|---|---|---|---|---|---|
| 2013-01-01 | 0.000000 | 0.000000 | -0.497244 | 5 | NaN |
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 5 | 1.0 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 5 | 2.0 |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | 5 | 3.0 |
| 2013-01-05 | 1.860643 | 0.478394 | 0.064366 | 5 | 4.0 |
| 2013-01-06 | 1.041973 | -1.543057 | -0.657560 | 5 | 5.0 |
# A where operation with setting:
df3 = df2.copy()
df3[df3 > 0] = -df3
df3
| A | B | C | D | F | |
|---|---|---|---|---|---|
| 2013-01-01 | 0.000000 | 0.000000 | -0.497244 | -5 | NaN |
| 2013-01-02 | -0.269064 | -3.410692 | -0.725410 | -5 | -1.0 |
| 2013-01-03 | -0.717987 | -0.692142 | -0.987139 | -5 | -2.0 |
| 2013-01-04 | -1.092892 | -1.422995 | -0.109079 | -5 | -3.0 |
| 2013-01-05 | -1.860643 | -0.478394 | -0.064366 | -5 | -4.0 |
| 2013-01-06 | -1.041973 | -1.543057 | -0.657560 | -5 | -5.0 |
# Missing data:
# pandas primarily uses the value np.nan to represent missing data. It is by default not included in computations. See the
# Missing Data section.
# Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of the data.
df3 = df2.reindex(index=dates[0:4], columns=list(df2.columns) + ["E"])
df3.loc[dates[0] : dates[1], "E"] = 1
df3
| A | B | C | D | F | E | |
|---|---|---|---|---|---|---|
| 2013-01-01 | 0.000000 | 0.000000 | -0.497244 | 5 | NaN | 1.0 |
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 5 | 1.0 | 1.0 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 5 | 2.0 | NaN |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | 5 | 3.0 | NaN |
# To drop any rows that have missing data:
df3.dropna(how="any")
| A | B | C | D | F | E | |
|---|---|---|---|---|---|---|
| 2013-01-02 | -0.269064 | -3.410692 | 0.72541 | 5 | 1.0 | 1.0 |
# Filling missing data:
df3.fillna(value=5)
| A | B | C | D | F | E | |
|---|---|---|---|---|---|---|
| 2013-01-01 | 0.000000 | 0.000000 | -0.497244 | 5 | 5.0 | 1.0 |
| 2013-01-02 | -0.269064 | -3.410692 | 0.725410 | 5 | 1.0 | 1.0 |
| 2013-01-03 | 0.717987 | -0.692142 | -0.987139 | 5 | 2.0 | 5.0 |
| 2013-01-04 | 1.092892 | 1.422995 | 0.109079 | 5 | 3.0 | 5.0 |
# To get the boolean mask where values are NaN:
pd.isna(df3)
| A | B | C | D | F | E | |
|---|---|---|---|---|---|---|
| 2013-01-01 | False | False | False | False | True | False |
| 2013-01-02 | False | False | False | False | False | False |
| 2013-01-03 | False | False | False | False | False | True |
| 2013-01-04 | False | False | False | False | False | True |
# Stats: operations in general exclude missing data. Performing a descriptive statistic:
df3.mean()
A    0.385454
B   -0.669960
C   -0.162473
D    5.000000
F    2.000000
E    1.000000
dtype: float64
# Same operation on the other axis:
df3.mean(axis=1)
2013-01-01    1.100551
2013-01-02    0.674276
2013-01-03    1.207741
2013-01-04    2.124993
Freq: D, dtype: float64
# Operating with objects that have different dimensionality and need alignment. In addition, pandas automatically broadcasts
# along the specified dimension.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2); s
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64
df3.sub(s, axis="index")
| A | B | C | D | F | E | |
|---|---|---|---|---|---|---|
| 2013-01-01 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2013-01-02 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2013-01-03 | -0.282013 | -1.692142 | -1.987139 | 4.0 | 1.0 | NaN |
| 2013-01-04 | -1.907108 | -1.577005 | -2.890921 | 2.0 | 0.0 | NaN |
| 2013-01-05 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2013-01-06 | NaN | NaN | NaN | NaN | NaN | NaN |
dfx = pd.DataFrame({'angles': [0, 3, 4], 'degrees': [360, 180, 360]}, index=['circle', 'triangle', 'rectangle']); dfx
| angles | degrees | |
|---|---|---|
| circle | 0 | 360 |
| triangle | 3 | 180 |
| rectangle | 4 | 360 |
dfx + 1
| angles | degrees | |
|---|---|---|
| circle | 1 | 361 |
| triangle | 4 | 181 |
| rectangle | 5 | 361 |
dfx.add(1)
| angles | degrees | |
|---|---|---|
| circle | 1 | 361 |
| triangle | 4 | 181 |
| rectangle | 5 | 361 |
dfx.div(10)
| angles | degrees | |
|---|---|---|
| circle | 0.0 | 36.0 |
| triangle | 0.3 | 18.0 |
| rectangle | 0.4 | 36.0 |
dfx.rdiv(10)
| angles | degrees | |
|---|---|---|
| circle | inf | 0.027778 |
| triangle | 3.333333 | 0.055556 |
| rectangle | 2.500000 | 0.027778 |
dfx - [1, 2]
| angles | degrees | |
|---|---|---|
| circle | -1 | 358 |
| triangle | 2 | 178 |
| rectangle | 3 | 358 |
dfx.sub([1, 2], axis='columns')
| angles | degrees | |
|---|---|---|
| circle | -1 | 358 |
| triangle | 2 | 178 |
| rectangle | 3 | 358 |
dfx.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']), axis='index')
| angles | degrees | |
|---|---|---|
| circle | -1 | 359 |
| triangle | 2 | 179 |
| rectangle | 3 | 359 |
# Multiply a DataFrame of different shape with operator version:
other = pd.DataFrame({'angles': [0, 3, 4]}, index=['circle', 'triangle', 'rectangle']); other
| angles | |
|---|---|
| circle | 0 |
| triangle | 3 |
| rectangle | 4 |
dfx * other
| angles | degrees | |
|---|---|---|
| circle | 0 | NaN |
| triangle | 9 | NaN |
| rectangle | 16 | NaN |
another = pd.DataFrame({'degrees': [0, 3, 4]}, index=['circle', 'triangle', 'rectangle']); another
| degrees | |
|---|---|
| circle | 0 |
| triangle | 3 |
| rectangle | 4 |
dfx * another
| angles | degrees | |
|---|---|---|
| circle | NaN | 0 |
| triangle | NaN | 540 |
| rectangle | NaN | 1440 |
dfx.mul(other, fill_value=0)
| angles | degrees | |
|---|---|---|
| circle | 0 | 0.0 |
| triangle | 9 | 0.0 |
| rectangle | 16 | 0.0 |
# Divide by a MultiIndex by level:
dfx_multindex = pd.DataFrame(
    {'angles': [0, 3, 4, 4, 5, 6], 'degrees': [360, 180, 360, 360, 540, 720]},
    index=[['A', 'A', 'A', 'B', 'B', 'B'],
           ['circle', 'triangle', 'rectangle', 'square', 'pentagon', 'hexagon']]); dfx_multindex
| angles | degrees | ||
|---|---|---|---|
| A | circle | 0 | 360 |
| triangle | 3 | 180 | |
| rectangle | 4 | 360 | |
| B | square | 4 | 360 |
| pentagon | 5 | 540 | |
| hexagon | 6 | 720 |
dfx.div(dfx_multindex, level=1, fill_value=0)
| angles | degrees | ||
|---|---|---|---|
| A | circle | NaN | 1.0 |
| triangle | 1.0 | 1.0 | |
| rectangle | 1.0 | 1.0 | |
| B | square | 0.0 | 0.0 |
| pentagon | 0.0 | 0.0 | |
| hexagon | 0.0 | 0.0 |
# Apply: applying functions to the data.
dfx.apply(np.cumsum)
| angles | degrees | |
|---|---|---|
| circle | 0 | 360 |
| triangle | 3 | 540 |
| rectangle | 7 | 900 |
dfx.apply(lambda x: x.max() - x.min())
angles 4 degrees 180 dtype: int64
# Histogramming: count how often each value occurs.
s = pd.Series(np.random.randint(0, 7, size=10)); s
0    2
1    2
2    6
3    6
4    2
5    6
6    5
7    4
8    3
9    3
dtype: int64
s.value_counts()
2    3
6    3
3    2
4    1
5    1
dtype: int64
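value_counts() can also report relative frequencies rather than raw counts. A sketch, using a fixed series so the proportions are predictable:

```python
import pandas as pd

s = pd.Series([2, 2, 6, 6, 2, 6, 5, 4, 3, 3])

# normalize=True divides each count by the total number of values,
# so the result sums to 1.
freq = s.value_counts(normalize=True)
print(freq)
```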
# String Methods
# Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of
# the array, as in the code snippet below.
# Note that pattern-matching in str generally uses regular expressions by default (and in some cases always uses them).
# See more at Vectorized String Methods.
s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
s.str.lower()
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object
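A few more str methods on the same series, as a sketch; each operates element-wise and propagates NaN:

```python
import numpy as np
import pandas as pd

s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])

# Element-wise pattern matching (regular expression); NaN stays NaN.
print(s.str.contains("a"))

# Element-wise string length.
print(s.str.len())

# Element-wise replacement.
print(s.str.replace("a", "@"))
```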
# Merge
# Concat: pandas provides various facilities for easily combining Series and DataFrame objects, with various kinds of set
# logic for the indexes and relational-algebra functionality in the case of join / merge-type operations. See the Merging section.
# Concatenating pandas objects together with concat():
dfr = pd.DataFrame(np.random.randn(10, 4)); dfr
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | 1.792219 | -1.270884 | -0.754388 | -0.678194 |
| 1 | 0.038137 | 0.125599 | 0.197699 | 2.070913 |
| 2 | -0.544621 | -0.912764 | 0.568775 | 0.122322 |
| 3 | -0.684584 | -1.113660 | -0.165820 | 0.213725 |
| 4 | -0.481085 | -0.886405 | 1.435884 | -0.640454 |
| 5 | -0.541138 | 1.056460 | -2.093542 | -0.605639 |
| 6 | 0.600976 | 2.028510 | -0.745702 | 1.802568 |
| 7 | -0.134494 | -1.094256 | 0.604732 | 0.675409 |
| 8 | 1.874577 | -0.695563 | -1.423242 | -2.109155 |
| 9 | 0.951339 | -0.430272 | -1.204767 | 0.694151 |
pieces = [dfr[:3], dfr[3:7], dfr[7:]]
pd.concat(pieces)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | 1.792219 | -1.270884 | -0.754388 | -0.678194 |
| 1 | 0.038137 | 0.125599 | 0.197699 | 2.070913 |
| 2 | -0.544621 | -0.912764 | 0.568775 | 0.122322 |
| 3 | -0.684584 | -1.113660 | -0.165820 | 0.213725 |
| 4 | -0.481085 | -0.886405 | 1.435884 | -0.640454 |
| 5 | -0.541138 | 1.056460 | -2.093542 | -0.605639 |
| 6 | 0.600976 | 2.028510 | -0.745702 | 1.802568 |
| 7 | -0.134494 | -1.094256 | 0.604732 | 0.675409 |
| 8 | 1.874577 | -0.695563 | -1.423242 | -2.109155 |
| 9 | 0.951339 | -0.430272 | -1.204767 | 0.694151 |
# Adding a column to a DataFrame is relatively fast. However, adding a row requires a copy, and may be expensive. We recommend
# passing a pre-built list of records to the DataFrame constructor instead of building a DataFrame by iteratively appending
# records to it. See Appending to dataframe for more.
result = dfx.append(dfr); result  # note: DataFrame.append is deprecated (and removed in pandas 2.0); prefer pd.concat as below
| angles | degrees | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|---|
| circle | 0.0 | 360.0 | NaN | NaN | NaN | NaN |
| triangle | 3.0 | 180.0 | NaN | NaN | NaN | NaN |
| rectangle | 4.0 | 360.0 | NaN | NaN | NaN | NaN |
| 0 | NaN | NaN | 1.792219 | -1.270884 | -0.754388 | -0.678194 |
| 1 | NaN | NaN | 0.038137 | 0.125599 | 0.197699 | 2.070913 |
| 2 | NaN | NaN | -0.544621 | -0.912764 | 0.568775 | 0.122322 |
| 3 | NaN | NaN | -0.684584 | -1.113660 | -0.165820 | 0.213725 |
| 4 | NaN | NaN | -0.481085 | -0.886405 | 1.435884 | -0.640454 |
| 5 | NaN | NaN | -0.541138 | 1.056460 | -2.093542 | -0.605639 |
| 6 | NaN | NaN | 0.600976 | 2.028510 | -0.745702 | 1.802568 |
| 7 | NaN | NaN | -0.134494 | -1.094256 | 0.604732 | 0.675409 |
| 8 | NaN | NaN | 1.874577 | -0.695563 | -1.423242 | -2.109155 |
| 9 | NaN | NaN | 0.951339 | -0.430272 | -1.204767 | 0.694151 |
result2 = pd.concat([dfx, dfr], ignore_index=True, sort=False); result2
| angles | degrees | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|---|
| 0 | 0.0 | 360.0 | NaN | NaN | NaN | NaN |
| 1 | 3.0 | 180.0 | NaN | NaN | NaN | NaN |
| 2 | 4.0 | 360.0 | NaN | NaN | NaN | NaN |
| 3 | NaN | NaN | 1.792219 | -1.270884 | -0.754388 | -0.678194 |
| 4 | NaN | NaN | 0.038137 | 0.125599 | 0.197699 | 2.070913 |
| 5 | NaN | NaN | -0.544621 | -0.912764 | 0.568775 | 0.122322 |
| 6 | NaN | NaN | -0.684584 | -1.113660 | -0.165820 | 0.213725 |
| 7 | NaN | NaN | -0.481085 | -0.886405 | 1.435884 | -0.640454 |
| 8 | NaN | NaN | -0.541138 | 1.056460 | -2.093542 | -0.605639 |
| 9 | NaN | NaN | 0.600976 | 2.028510 | -0.745702 | 1.802568 |
| 10 | NaN | NaN | -0.134494 | -1.094256 | 0.604732 | 0.675409 |
| 11 | NaN | NaN | 1.874577 | -0.695563 | -1.423242 | -2.109155 |
| 12 | NaN | NaN | 0.951339 | -0.430272 | -1.204767 | 0.694151 |
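concat() can also label each piece via the keys parameter, producing a MultiIndex that records which piece each row came from. A sketch:

```python
import numpy as np
import pandas as pd

dfr = pd.DataFrame(np.random.randn(6, 4))
pieces = [dfr[:3], dfr[3:]]

# keys= adds an outer index level naming each piece.
result = pd.concat(pieces, keys=["first", "second"])
print(result)

# The outer level can then be used to recover a piece:
print(result.loc["first"])
```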
# Join: SQL style merges. See the Database style joining section.
left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
left
| key | lval | |
|---|---|---|
| 0 | foo | 1 |
| 1 | foo | 2 |
right
| key | rval | |
|---|---|---|
| 0 | foo | 4 |
| 1 | foo | 5 |
pd.merge(left, right, on="key")
| key | lval | rval | |
|---|---|---|---|
| 0 | foo | 1 | 4 |
| 1 | foo | 1 | 5 |
| 2 | foo | 2 | 4 |
| 3 | foo | 2 | 5 |
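With duplicate keys, merge produces the Cartesian product shown above. With unique keys the join type matters instead; a sketch of inner vs. left joins on illustrative data:

```python
import pandas as pd

left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "baz"], "rval": [4, 5]})

# Inner join (the default): only keys present on both sides survive.
print(pd.merge(left, right, on="key"))

# Left join: every left row is kept; unmatched right values become NaN.
out = pd.merge(left, right, on="key", how="left")
print(out)
```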
hrdata.csv
| Name | Hire Date | Salary | Sick Days remaining |
|---|---|---|---|
| Graham Chapman | 03/15/14 | 50000 | 10 |
| John Cleese | 06/01/15 | 65000 | 8 |
| Eric Idle | 05/12/14 | 45000 | 10 |
| Terry Jones | 11/01/13 | 70000 | 3 |
| Terry Gilliam | 08/12/14 | 48000 | 7 |
| Michael Palin | 05/23/13 | 66000 | 8 |
# By default, pandas.read_csv('hrdata.csv') numbers the rows with a zero-based integer index, because we didn't tell it what our
# index should be. To use a different column as the DataFrame index, add the index_col optional parameter:
import pandas
df2 = pandas.read_csv('hrdata.csv', index_col='Name')
print(df2)
# Further, if you look at the data types of our columns, you'll see pandas has properly converted the Salary and Sick Days remaining columns
# to numbers, but the Hire Date column is still a string. This is easily confirmed in interactive mode:
print(type(df2['Hire Date'][0]))
# You can force pandas to read data as a date with the parse_dates optional parameter, which is defined as a list of column names
# to treat as dates:
import pandas
df2 = pandas.read_csv('hrdata.csv', index_col='Name', parse_dates=['Hire Date'])
print(df2)
print(type(df2['Hire Date'][0]))
# If your CSV file doesn't have column names in the first line, you can use the names optional parameter to provide a list of column names.
# You can also use this if you want to override the column names provided in the first line.
# In this case, you must also tell pandas.read_csv() to ignore existing column names using the header=0 optional parameter:
import pandas
df2 = pandas.read_csv('hrdata.csv',
                      index_col='Employee',
                      parse_dates=['Hired'],
                      header=0,
                      names=['Employee', 'Hired', 'Salary', 'Sick Days'])
print(df2)
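Writing the cleaned frame back out is the inverse operation. A sketch of how a file like hrdata_modified.csv could be produced with DataFrame.to_csv (the data here is a two-row excerpt, and a StringIO buffer stands in for the file):

```python
import io
import pandas as pd

df2 = pd.DataFrame(
    {"Hired": pd.to_datetime(["2014-03-15", "2015-06-01"]),
     "Salary": [50000, 65000],
     "Sick Days": [10, 8]},
    index=pd.Index(["Graham Chapman", "John Cleese"], name="Employee"),
)

# to_csv writes the index (here: Employee) as the first column by default;
# pass a filename instead of a buffer to write to disk.
buf = io.StringIO()
df2.to_csv(buf)
print(buf.getvalue())
```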
hrdata_modified.csv
| Employee | Hired | Salary | Sick Days |
|---|---|---|---|
| Graham Chapman | 2014-03-15 | 50000 | 10 |
| John Cleese | 2015-06-01 | 65000 | 8 |
| Eric Idle | 2014-05-12 | 45000 | 10 |
| Terry Jones | 2013-11-01 | 70000 | 3 |
| Terry Gilliam | 2014-08-12 | 48000 | 7 |
| Michael Palin | 2013-05-23 | 66000 | 8 |