Manipulation Selection#

Ref:

https://docs.pola.rs/api/python/stable/reference/dataframe/modify_select.html

[2]:

import polars as pl
import faker

# Seed Faker's shared random generator so that a fresh "Restart & Run All"
# rebuilds exactly the same fake names and phone numbers.
faker.Faker.seed(0)
fake = faker.Faker()

# Build a small demo table: sequential 1-based ids plus fake name/phone strings.
n_row = 50
data = {
    "id": list(range(1, 1+n_row)),
    "name": [fake.name() for _ in range(n_row)],
    "phone": [fake.phone_number() for _ in range(n_row)],
}
df = pl.DataFrame(data)
df

[2]:

shape: (50, 3)

id	name	phone
i64	str	str
1	"Maria Powell"	"836.602.8286x06763"
2	"Jacqueline Fletcher PhD"	"+1-801-639-6835"
3	"Mary Wilson"	"(999)905-1935"
4	"Kenneth Davis"	"332-275-7129x0289"
5	"Christopher Martin"	"(792)727-8878x82819"
…	…	…
46	"William Keller"	"(695)744-1587x0883"
47	"Vickie Perez"	"+1-297-963-9194x6132"
48	"William Howard"	"630.391.0772x26252"
49	"Laura Nash"	"(495)680-1361x6763"
50	"Donald Johnson"	"(282)973-8109x024"

Select by Rows#

Take First N#

[3]:

# First five rows of the frame
df.head(n=5)

[3]:

shape: (5, 3)

id	name	phone
i64	str	str
1	"Maria Powell"	"836.602.8286x06763"
2	"Jacqueline Fletcher PhD"	"+1-801-639-6835"
3	"Mary Wilson"	"(999)905-1935"
4	"Kenneth Davis"	"332-275-7129x0289"
5	"Christopher Martin"	"(792)727-8878x82819"

Take Last N#

[4]:

# Last five rows of the frame
df.tail(n=5)

[4]:

shape: (5, 3)

id	name	phone
i64	str	str
46	"William Keller"	"(695)744-1587x0883"
47	"Vickie Perez"	"+1-297-963-9194x6132"
48	"William Howard"	"630.391.0772x26252"
49	"Laura Nash"	"(495)680-1361x6763"
50	"Donald Johnson"	"(282)973-8109x024"

Select By Index Range#

Note: Polars does not have an index or multi-index, so rows are selected by their integer position.

[5]:

# Method 1: skip the first 5 rows, then take the next 10
df.slice(5, 10)

[5]:

shape: (10, 3)

id	name	phone
i64	str	str
6	"Patricia Brown"	"363-719-4855x7138"
7	"Richard Hodges"	"(831)255-4407x13496"
8	"Steve Green"	"671-700-3796x127"
9	"Luis Smith"	"844.513.7054x4915"
10	"Morgan Hensley"	"001-741-524-3690x958"
11	"Tanya Peck"	"819-449-4406"
12	"Mr. Joseph Parrish"	"409-954-9495x85424"
13	"Juan Frazier"	"282.948.5514x337"
14	"David Murray"	"902-780-8331x8359"
15	"Ryan Campbell"	"7713098760"

[6]:

# Method 2: describe the range with 1-based, inclusive start/end row numbers
# and convert it to the 0-based offset + length that slice() expects
start_idx = 6
end_idx = 15
n_selected = end_idx - start_idx + 1
df.slice(offset=start_idx - 1, length=n_selected)

[6]:

shape: (10, 3)

id	name	phone
i64	str	str
6	"Patricia Brown"	"363-719-4855x7138"
7	"Richard Hodges"	"(831)255-4407x13496"
8	"Steve Green"	"671-700-3796x127"
9	"Luis Smith"	"844.513.7054x4915"
10	"Morgan Hensley"	"001-741-524-3690x958"
11	"Tanya Peck"	"819-449-4406"
12	"Mr. Joseph Parrish"	"409-954-9495x85424"
13	"Juan Frazier"	"282.948.5514x337"
14	"David Murray"	"902-780-8331x8359"
15	"Ryan Campbell"	"7713098760"

Select One Row By Index#

[7]:

# Use df.row(i) to get a single row as a tuple;
# df[5] would return a DataFrame instead
df.row(5)

[7]:

(6, 'Patricia Brown', '363-719-4855x7138')

[8]:

# named=True returns the row as a dict keyed by column name
df.row(5, named=True)

[8]:

{'id': 6, 'name': 'Patricia Brown', 'phone': '363-719-4855x7138'}

Select Rows by Multiple Index#

[9]:

# A list of integers selects the rows at those 0-based positions
df[[1, 3, 5]]

[9]:

shape: (3, 3)

id	name	phone
i64	str	str
2	"Jacqueline Fletcher PhD"	"+1-801-639-6835"
4	"Kenneth Davis"	"332-275-7129x0289"
6	"Patricia Brown"	"363-719-4855x7138"

Randomly Sample By Rows#

[10]:

# Draw 5 rows at random
df.sample(n=5)

[10]:

shape: (5, 3)

id	name	phone
i64	str	str
24	"Erica Keller"	"864.464.0633"
46	"William Keller"	"(695)744-1587x0883"
18	"James Miller"	"369.515.1819x923"
21	"Daniel Walker MD"	"+1-832-758-6101x9324"
12	"Mr. Joseph Parrish"	"409-954-9495x85424"

[11]:

# Draw a random 10% of the rows (5 of the 50 rows here)
df.sample(fraction=0.1)

[11]:

shape: (5, 3)

id	name	phone
i64	str	str
30	"Sarah Pittman"	"6854545835"
11	"Tanya Peck"	"819-449-4406"
8	"Steve Green"	"671-700-3796x127"
17	"Andrew Lee"	"754-587-2094"
40	"Francisco Gonzalez"	"001-427-821-4543x436"

[12]:

# Sample half as many rows as the frame has, with replacement,
# so the same row may be drawn more than once
df_res = df.sample(fraction=0.5, with_replacement=True)
df_res

[12]:

shape: (25, 3)

id	name	phone
i64	str	str
14	"David Murray"	"902-780-8331x8359"
12	"Mr. Joseph Parrish"	"409-954-9495x85424"
24	"Erica Keller"	"864.464.0633"
20	"Lindsay Meza"	"466-888-0910x674"
35	"Patricia Simmons"	"(816)705-8827x9632"
…	…	…
24	"Erica Keller"	"864.464.0633"
24	"Erica Keller"	"864.464.0633"
10	"Morgan Hensley"	"001-741-524-3690x958"
42	"Tina Allen"	"669.671.7571"
50	"Donald Johnson"	"(282)973-8109x024"

[13]:

# Count the distinct rows in the sample; because rows can repeat,
# this most likely won't be n_row / 2
df_res.n_unique()

[13]:

Select By Columns#

Select one Column#

[14]:

# Selecting a single column name still returns a (one-column) DataFrame
df.select("name")

[14]:

shape: (50, 1)

name
str
"Maria Powell"
"Jacqueline Fletcher PhD"
"Mary Wilson"
"Kenneth Davis"
"Christopher Martin"
…
"William Keller"
"Vickie Perez"
"William Howard"
"Laura Nash"
"Donald Johnson"

Select multiple Column#

[15]:

# Keep only the id and name columns, in that order
df.select("id", "name")

[15]:

shape: (50, 2)

id	name
i64	str
1	"Maria Powell"
2	"Jacqueline Fletcher PhD"
3	"Mary Wilson"
4	"Kenneth Davis"
5	"Christopher Martin"
…	…
46	"William Keller"
47	"Vickie Perez"
48	"William Howard"
49	"Laura Nash"
50	"Donald Johnson"

[16]:

# Column names in frame order; slice the list to pick the first two
columns = df.columns
df.select(columns[:2])

[16]:

shape: (50, 2)

id	name
i64	str
1	"Maria Powell"
2	"Jacqueline Fletcher PhD"
3	"Mary Wilson"
4	"Kenneth Davis"
5	"Christopher Martin"
…	…
46	"William Keller"
47	"Vickie Perez"
48	"William Howard"
49	"Laura Nash"
50	"Donald Johnson"

[17]:

# Select the last two column names from the `columns` list built in an earlier cell
df.select(columns[-2:])

[17]:

shape: (50, 2)

name	phone
str	str
"Maria Powell"	"836.602.8286x06763"
"Jacqueline Fletcher PhD"	"+1-801-639-6835"
"Mary Wilson"	"(999)905-1935"
"Kenneth Davis"	"332-275-7129x0289"
"Christopher Martin"	"(792)727-8878x82819"
…	…
"William Keller"	"(695)744-1587x0883"
"Vickie Perez"	"+1-297-963-9194x6132"
"William Howard"	"630.391.0772x26252"
"Laura Nash"	"(495)680-1361x6763"
"Donald Johnson"	"(282)973-8109x024"

Select By Both Rows and Columns#

[18]:

# Narrow to the wanted columns first, then pick rows by 0-based position
df.select(["id", "name"])[[1, 3, 5]]

[18]:

shape: (3, 2)

id	name
i64	str
2	"Jacqueline Fletcher PhD"
4	"Kenneth Davis"
6	"Patricia Brown"

Select a Specific Cell#

[19]:

# Address a single cell by row position and column name
df.item(row=0, column="id")

[19]:

[20]:

# Address a single cell by row position and column position
df.item(row=0, column=0)

[20]:

Iterate Over Columns#

[21]:

# Each item yielded by iter_columns() is one column as a polars Series
for series in df.iter_columns():
    print(f"--- {series.name = }")
    print(f"{type(series) = }")
    print(f"{series = }")

--- series.name = 'id'
type(series) = <class 'polars.series.series.Series'>
series = shape: (50,)
Series: 'id' [i64]
[
        1
        2
        3
        4
        5
        …
        46
        47
        48
        49
        50
]
--- series.name = 'name'
type(series) = <class 'polars.series.series.Series'>
series = shape: (50,)
Series: 'name' [str]
[
        "Maria Powell"
        "Jacqueline Fletcher PhD"
        "Mary Wilson"
        "Kenneth Davis"
        "Christopher Martin"
        …
        "William Keller"
        "Vickie Perez"
        "William Howard"
        "Laura Nash"
        "Donald Johnson"
]
--- series.name = 'phone'
type(series) = <class 'polars.series.series.Series'>
series = shape: (50,)
Series: 'phone' [str]
[
        "836.602.8286x06763"
        "+1-801-639-6835"
        "(999)905-1935"
        "332-275-7129x0289"
        "(792)727-8878x82819"
        …
        "(695)744-1587x0883"
        "+1-297-963-9194x6132"
        "630.391.0772x26252"
        "(495)680-1361x6763"
        "(282)973-8109x024"
]

Iterate Over Rows#

[22]:

# Each item yielded by iter_rows() is one row as a plain tuple.
# Note: this loop prints every row of the frame.
for row in df.iter_rows():
    print(f"--- {row[0] = }")
    print(f"{type(row) = }")
    print(f"{row = }")

--- row[0] = 1
type(row) = <class 'tuple'>
row = (1, 'Maria Powell', '836.602.8286x06763')
--- row[0] = 2
type(row) = <class 'tuple'>
row = (2, 'Jacqueline Fletcher PhD', '+1-801-639-6835')
--- row[0] = 3
type(row) = <class 'tuple'>
row = (3, 'Mary Wilson', '(999)905-1935')
--- row[0] = 4
type(row) = <class 'tuple'>
row = (4, 'Kenneth Davis', '332-275-7129x0289')
--- row[0] = 5
type(row) = <class 'tuple'>
row = (5, 'Christopher Martin', '(792)727-8878x82819')
--- row[0] = 6
type(row) = <class 'tuple'>
row = (6, 'Patricia Brown', '363-719-4855x7138')
--- row[0] = 7
type(row) = <class 'tuple'>
row = (7, 'Richard Hodges', '(831)255-4407x13496')
--- row[0] = 8
type(row) = <class 'tuple'>
row = (8, 'Steve Green', '671-700-3796x127')
--- row[0] = 9
type(row) = <class 'tuple'>
row = (9, 'Luis Smith', '844.513.7054x4915')
--- row[0] = 10
type(row) = <class 'tuple'>
row = (10, 'Morgan Hensley', '001-741-524-3690x958')
--- row[0] = 11
type(row) = <class 'tuple'>
row = (11, 'Tanya Peck', '819-449-4406')
--- row[0] = 12
type(row) = <class 'tuple'>
row = (12, 'Mr. Joseph Parrish', '409-954-9495x85424')
--- row[0] = 13
type(row) = <class 'tuple'>
row = (13, 'Juan Frazier', '282.948.5514x337')
--- row[0] = 14
type(row) = <class 'tuple'>
row = (14, 'David Murray', '902-780-8331x8359')
--- row[0] = 15
type(row) = <class 'tuple'>
row = (15, 'Ryan Campbell', '7713098760')
--- row[0] = 16
type(row) = <class 'tuple'>
row = (16, 'Debra Harris', '984.923.3706')
--- row[0] = 17
type(row) = <class 'tuple'>
row = (17, 'Andrew Lee', '754-587-2094')
--- row[0] = 18
type(row) = <class 'tuple'>
row = (18, 'James Miller', '369.515.1819x923')
--- row[0] = 19
type(row) = <class 'tuple'>
row = (19, 'Emma Gentry', '001-323-490-9924x59516')
--- row[0] = 20
type(row) = <class 'tuple'>
row = (20, 'Lindsay Meza', '466-888-0910x674')
--- row[0] = 21
type(row) = <class 'tuple'>
row = (21, 'Daniel Walker MD', '+1-832-758-6101x9324')
--- row[0] = 22
type(row) = <class 'tuple'>
row = (22, 'Jon Howard', '666-699-5756x1435')
--- row[0] = 23
type(row) = <class 'tuple'>
row = (23, 'Shannon Johnson', '8148458256')
--- row[0] = 24
type(row) = <class 'tuple'>
row = (24, 'Erica Keller', '864.464.0633')
--- row[0] = 25
type(row) = <class 'tuple'>
row = (25, 'Betty Perez', '869.484.7373')
--- row[0] = 26
type(row) = <class 'tuple'>
row = (26, 'John Wiley Jr.', '882-603-1099x880')
--- row[0] = 27
type(row) = <class 'tuple'>
row = (27, 'Samantha Gutierrez', '(688)944-5982x705')
--- row[0] = 28
type(row) = <class 'tuple'>
row = (28, 'Matthew Tucker', '+1-577-433-9373x084')
--- row[0] = 29
type(row) = <class 'tuple'>
row = (29, 'Jennifer Black', '(832)819-8567')
--- row[0] = 30
type(row) = <class 'tuple'>
row = (30, 'Sarah Pittman', '6854545835')
--- row[0] = 31
type(row) = <class 'tuple'>
row = (31, 'Dennis Vaughan', '6862893476')
--- row[0] = 32
type(row) = <class 'tuple'>
row = (32, 'Kevin Martinez', '(419)553-1369')
--- row[0] = 33
type(row) = <class 'tuple'>
row = (33, 'Victor Norman', '303-258-9000x83809')
--- row[0] = 34
type(row) = <class 'tuple'>
row = (34, 'Daniel Murray', '675-839-2466')
--- row[0] = 35
type(row) = <class 'tuple'>
row = (35, 'Patricia Simmons', '(816)705-8827x9632')
--- row[0] = 36
type(row) = <class 'tuple'>
row = (36, 'Emma Gomez', '653.227.9975')
--- row[0] = 37
type(row) = <class 'tuple'>
row = (37, 'Charles Robinson', '+1-687-438-5234')
--- row[0] = 38
type(row) = <class 'tuple'>
row = (38, 'Anthony Smith', '+1-323-237-2932')
--- row[0] = 39
type(row) = <class 'tuple'>
row = (39, 'Steven Bennett', '6106982086')
--- row[0] = 40
type(row) = <class 'tuple'>
row = (40, 'Francisco Gonzalez', '001-427-821-4543x436')
--- row[0] = 41
type(row) = <class 'tuple'>
row = (41, 'Stephanie Shaffer', '001-999-779-0085x959')
--- row[0] = 42
type(row) = <class 'tuple'>
row = (42, 'Tina Allen', '669.671.7571')
--- row[0] = 43
type(row) = <class 'tuple'>
row = (43, 'Matthew Harrell', '678.831.5948')
--- row[0] = 44
type(row) = <class 'tuple'>
row = (44, 'John Hill', '355.476.5055')
--- row[0] = 45
type(row) = <class 'tuple'>
row = (45, 'James Shea', '+1-903-530-8220x480')
--- row[0] = 46
type(row) = <class 'tuple'>
row = (46, 'William Keller', '(695)744-1587x0883')
--- row[0] = 47
type(row) = <class 'tuple'>
row = (47, 'Vickie Perez', '+1-297-963-9194x6132')
--- row[0] = 48
type(row) = <class 'tuple'>
row = (48, 'William Howard', '630.391.0772x26252')
--- row[0] = 49
type(row) = <class 'tuple'>
row = (49, 'Laura Nash', '(495)680-1361x6763')
--- row[0] = 50
type(row) = <class 'tuple'>
row = (50, 'Donald Johnson', '(282)973-8109x024')

Iterate Over Slices#

Split the DataFrame into sub-DataFrames with fewer rows each.

[23]:

# Case 1: the total number of rows is a multiple of ``n_rows``,
# so every sub-DataFrame has the same number of rows
for ith_df, sub_df in enumerate(df.iter_slices(n_rows=n_row // 5), start=1):
    print(f"--- {ith_df = }")
    print(sub_df)

--- ith_df = 1
shape: (10, 3)
┌─────┬─────────────────────────┬──────────────────────┐
│ id  ┆ name                    ┆ phone                │
│ --- ┆ ---                     ┆ ---                  │
│ i64 ┆ str                     ┆ str                  │
╞═════╪═════════════════════════╪══════════════════════╡
│ 1   ┆ Maria Powell            ┆ 836.602.8286x06763   │
│ 2   ┆ Jacqueline Fletcher PhD ┆ +1-801-639-6835      │
│ 3   ┆ Mary Wilson             ┆ (999)905-1935        │
│ 4   ┆ Kenneth Davis           ┆ 332-275-7129x0289    │
│ 5   ┆ Christopher Martin      ┆ (792)727-8878x82819  │
│ 6   ┆ Patricia Brown          ┆ 363-719-4855x7138    │
│ 7   ┆ Richard Hodges          ┆ (831)255-4407x13496  │
│ 8   ┆ Steve Green             ┆ 671-700-3796x127     │
│ 9   ┆ Luis Smith              ┆ 844.513.7054x4915    │
│ 10  ┆ Morgan Hensley          ┆ 001-741-524-3690x958 │
└─────┴─────────────────────────┴──────────────────────┘
--- ith_df = 2
shape: (10, 3)
┌─────┬────────────────────┬────────────────────────┐
│ id  ┆ name               ┆ phone                  │
│ --- ┆ ---                ┆ ---                    │
│ i64 ┆ str                ┆ str                    │
╞═════╪════════════════════╪════════════════════════╡
│ 11  ┆ Tanya Peck         ┆ 819-449-4406           │
│ 12  ┆ Mr. Joseph Parrish ┆ 409-954-9495x85424     │
│ 13  ┆ Juan Frazier       ┆ 282.948.5514x337       │
│ 14  ┆ David Murray       ┆ 902-780-8331x8359      │
│ 15  ┆ Ryan Campbell      ┆ 7713098760             │
│ 16  ┆ Debra Harris       ┆ 984.923.3706           │
│ 17  ┆ Andrew Lee         ┆ 754-587-2094           │
│ 18  ┆ James Miller       ┆ 369.515.1819x923       │
│ 19  ┆ Emma Gentry        ┆ 001-323-490-9924x59516 │
│ 20  ┆ Lindsay Meza       ┆ 466-888-0910x674       │
└─────┴────────────────────┴────────────────────────┘
--- ith_df = 3
shape: (10, 3)
┌─────┬────────────────────┬──────────────────────┐
│ id  ┆ name               ┆ phone                │
│ --- ┆ ---                ┆ ---                  │
│ i64 ┆ str                ┆ str                  │
╞═════╪════════════════════╪══════════════════════╡
│ 21  ┆ Daniel Walker MD   ┆ +1-832-758-6101x9324 │
│ 22  ┆ Jon Howard         ┆ 666-699-5756x1435    │
│ 23  ┆ Shannon Johnson    ┆ 8148458256           │
│ 24  ┆ Erica Keller       ┆ 864.464.0633         │
│ 25  ┆ Betty Perez        ┆ 869.484.7373         │
│ 26  ┆ John Wiley Jr.     ┆ 882-603-1099x880     │
│ 27  ┆ Samantha Gutierrez ┆ (688)944-5982x705    │
│ 28  ┆ Matthew Tucker     ┆ +1-577-433-9373x084  │
│ 29  ┆ Jennifer Black     ┆ (832)819-8567        │
│ 30  ┆ Sarah Pittman      ┆ 6854545835           │
└─────┴────────────────────┴──────────────────────┘
--- ith_df = 4
shape: (10, 3)
┌─────┬────────────────────┬──────────────────────┐
│ id  ┆ name               ┆ phone                │
│ --- ┆ ---                ┆ ---                  │
│ i64 ┆ str                ┆ str                  │
╞═════╪════════════════════╪══════════════════════╡
│ 31  ┆ Dennis Vaughan     ┆ 6862893476           │
│ 32  ┆ Kevin Martinez     ┆ (419)553-1369        │
│ 33  ┆ Victor Norman      ┆ 303-258-9000x83809   │
│ 34  ┆ Daniel Murray      ┆ 675-839-2466         │
│ 35  ┆ Patricia Simmons   ┆ (816)705-8827x9632   │
│ 36  ┆ Emma Gomez         ┆ 653.227.9975         │
│ 37  ┆ Charles Robinson   ┆ +1-687-438-5234      │
│ 38  ┆ Anthony Smith      ┆ +1-323-237-2932      │
│ 39  ┆ Steven Bennett     ┆ 6106982086           │
│ 40  ┆ Francisco Gonzalez ┆ 001-427-821-4543x436 │
└─────┴────────────────────┴──────────────────────┘
--- ith_df = 5
shape: (10, 3)
┌─────┬───────────────────┬──────────────────────┐
│ id  ┆ name              ┆ phone                │
│ --- ┆ ---               ┆ ---                  │
│ i64 ┆ str               ┆ str                  │
╞═════╪═══════════════════╪══════════════════════╡
│ 41  ┆ Stephanie Shaffer ┆ 001-999-779-0085x959 │
│ 42  ┆ Tina Allen        ┆ 669.671.7571         │
│ 43  ┆ Matthew Harrell   ┆ 678.831.5948         │
│ 44  ┆ John Hill         ┆ 355.476.5055         │
│ 45  ┆ James Shea        ┆ +1-903-530-8220x480  │
│ 46  ┆ William Keller    ┆ (695)744-1587x0883   │
│ 47  ┆ Vickie Perez      ┆ +1-297-963-9194x6132 │
│ 48  ┆ William Howard    ┆ 630.391.0772x26252   │
│ 49  ┆ Laura Nash        ┆ (495)680-1361x6763   │
│ 50  ┆ Donald Johnson    ┆ (282)973-8109x024    │
└─────┴───────────────────┴──────────────────────┘

[24]:

# Case 2: the total number of rows is NOT a multiple of ``n_rows``.
# That is fine: the last sub-DataFrame simply holds the remaining rows
for ith_df, sub_df in enumerate(df.iter_slices(n_rows=13), start=1):
    print(f"--- {ith_df = }")
    print(sub_df)

--- ith_df = 1
shape: (13, 3)
┌─────┬─────────────────────────┬──────────────────────┐
│ id  ┆ name                    ┆ phone                │
│ --- ┆ ---                     ┆ ---                  │
│ i64 ┆ str                     ┆ str                  │
╞═════╪═════════════════════════╪══════════════════════╡
│ 1   ┆ Maria Powell            ┆ 836.602.8286x06763   │
│ 2   ┆ Jacqueline Fletcher PhD ┆ +1-801-639-6835      │
│ 3   ┆ Mary Wilson             ┆ (999)905-1935        │
│ 4   ┆ Kenneth Davis           ┆ 332-275-7129x0289    │
│ 5   ┆ Christopher Martin      ┆ (792)727-8878x82819  │
│ …   ┆ …                       ┆ …                    │
│ 9   ┆ Luis Smith              ┆ 844.513.7054x4915    │
│ 10  ┆ Morgan Hensley          ┆ 001-741-524-3690x958 │
│ 11  ┆ Tanya Peck              ┆ 819-449-4406         │
│ 12  ┆ Mr. Joseph Parrish      ┆ 409-954-9495x85424   │
│ 13  ┆ Juan Frazier            ┆ 282.948.5514x337     │
└─────┴─────────────────────────┴──────────────────────┘
--- ith_df = 2
shape: (13, 3)
┌─────┬─────────────────┬───────────────────┐
│ id  ┆ name            ┆ phone             │
│ --- ┆ ---             ┆ ---               │
│ i64 ┆ str             ┆ str               │
╞═════╪═════════════════╪═══════════════════╡
│ 14  ┆ David Murray    ┆ 902-780-8331x8359 │
│ 15  ┆ Ryan Campbell   ┆ 7713098760        │
│ 16  ┆ Debra Harris    ┆ 984.923.3706      │
│ 17  ┆ Andrew Lee      ┆ 754-587-2094      │
│ 18  ┆ James Miller    ┆ 369.515.1819x923  │
│ …   ┆ …               ┆ …                 │
│ 22  ┆ Jon Howard      ┆ 666-699-5756x1435 │
│ 23  ┆ Shannon Johnson ┆ 8148458256        │
│ 24  ┆ Erica Keller    ┆ 864.464.0633      │
│ 25  ┆ Betty Perez     ┆ 869.484.7373      │
│ 26  ┆ John Wiley Jr.  ┆ 882-603-1099x880  │
└─────┴─────────────────┴───────────────────┘
--- ith_df = 3
shape: (13, 3)
┌─────┬────────────────────┬─────────────────────┐
│ id  ┆ name               ┆ phone               │
│ --- ┆ ---                ┆ ---                 │
│ i64 ┆ str                ┆ str                 │
╞═════╪════════════════════╪═════════════════════╡
│ 27  ┆ Samantha Gutierrez ┆ (688)944-5982x705   │
│ 28  ┆ Matthew Tucker     ┆ +1-577-433-9373x084 │
│ 29  ┆ Jennifer Black     ┆ (832)819-8567       │
│ 30  ┆ Sarah Pittman      ┆ 6854545835          │
│ 31  ┆ Dennis Vaughan     ┆ 6862893476          │
│ …   ┆ …                  ┆ …                   │
│ 35  ┆ Patricia Simmons   ┆ (816)705-8827x9632  │
│ 36  ┆ Emma Gomez         ┆ 653.227.9975        │
│ 37  ┆ Charles Robinson   ┆ +1-687-438-5234     │
│ 38  ┆ Anthony Smith      ┆ +1-323-237-2932     │
│ 39  ┆ Steven Bennett     ┆ 6106982086          │
└─────┴────────────────────┴─────────────────────┘
--- ith_df = 4
shape: (11, 3)
┌─────┬────────────────────┬──────────────────────┐
│ id  ┆ name               ┆ phone                │
│ --- ┆ ---                ┆ ---                  │
│ i64 ┆ str                ┆ str                  │
╞═════╪════════════════════╪══════════════════════╡
│ 40  ┆ Francisco Gonzalez ┆ 001-427-821-4543x436 │
│ 41  ┆ Stephanie Shaffer  ┆ 001-999-779-0085x959 │
│ 42  ┆ Tina Allen         ┆ 669.671.7571         │
│ 43  ┆ Matthew Harrell    ┆ 678.831.5948         │
│ 44  ┆ John Hill          ┆ 355.476.5055         │
│ …   ┆ …                  ┆ …                    │
│ 46  ┆ William Keller     ┆ (695)744-1587x0883   │
│ 47  ┆ Vickie Perez       ┆ +1-297-963-9194x6132 │
│ 48  ┆ William Howard     ┆ 630.391.0772x26252   │
│ 49  ┆ Laura Nash         ┆ (495)680-1361x6763   │
│ 50  ┆ Donald Johnson     ┆ (282)973-8109x024    │
└─────┴────────────────────┴──────────────────────┘

Concatenate#

Concatenate Vertically (More rows, same columns)#

DataFrame.extend will edit the first DataFrame in-place!

[25]:

# extend() appends df2's rows onto df1 itself (in-place modification)
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"id": [4, 5, 6]})
df1.extend(df2)

[25]:

shape: (6, 1)

id
i64
1
2
3
4
5
6

[26]:

# df1 itself now holds all six rows
df1

[26]:

shape: (6, 1)

id
i64
1
2
3
4
5
6

DataFrame.vstack DOES NOT edit the first DataFrame in-place!

[27]:

# vstack() returns a new, taller DataFrame and leaves df1 as it was
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"id": [4, 5, 6]})
df1.vstack(df2)

[27]:

shape: (6, 1)

id
i64
1
2
3
4
5
6

[28]:

# df1 is unchanged: still three rows
df1

[28]:

shape: (3, 1)

id
i64
1
2
3

[30]:

# vstack fails here because the two frames have different columns
# (df1 has one column, df2 has two); catch the error and show it
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"id": [4, 5, 6], "name": ["d", "e", "f"]})
try:
    df1.vstack(df2)
except Exception as e:
    print(repr(e))

ShapeError('unable to append to a DataFrame of width 1 with a DataFrame of width 2')

Concatenate Horizontally (More columns, same rows)#

DataFrame.hstack DOES NOT edit the first DataFrame in-place!

[31]:

# hstack() returns a new, wider DataFrame and leaves df1 as it was
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"name": ["a", "b", "c"]})
df1.hstack(df2)

[31]:

shape: (3, 2)

id	name
i64	str
1	"a"
2	"b"
3	"c"

[32]:

# df1 is unchanged: still a single column
df1

[32]:

shape: (3, 1)

id
i64
1
2
3

[33]:

# hstack fails here because the two frames have different numbers of rows
# (3 vs 5); catch the error and show it
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"name": ["a", "b", "c", "d", "e"]})
try:
    df1.hstack(df2)
except Exception as e:
    print(repr(e))

ShapeError('could not create a new DataFrame: series "id" has length 3 while series "name" has length 5')

[ ]: