Manipulation Selection#

Ref:

[2]:
import polars as pl
import faker

# Build a small demo table of fake contacts that every later cell reuses as `df`.
# NOTE: no seed is set in this cell, so the generated names and phone numbers
# are expected to differ from run to run.
fake = faker.Faker()

n_row = 50
# Columns are generated in declaration order: ids (1..n_row), then names, then phones.
data = dict(
    id=list(range(1, n_row + 1)),
    name=[fake.name() for _ in range(n_row)],
    phone=[fake.phone_number() for _ in range(n_row)],
)
df = pl.DataFrame(data)
df
[2]:
shape: (50, 3)
idnamephone
i64strstr
1"Maria Powell""836.602.8286x06763"
2"Jacqueline Fletcher PhD""+1-801-639-6835"
3"Mary Wilson""(999)905-1935"
4"Kenneth Davis""332-275-7129x0289"
5"Christopher Martin""(792)727-8878x82819"
46"William Keller""(695)744-1587x0883"
47"Vickie Perez""+1-297-963-9194x6132"
48"William Howard""630.391.0772x26252"
49"Laura Nash""(495)680-1361x6763"
50"Donald Johnson""(282)973-8109x024"

Select by Rows#

Take First N#

[3]:
# Keep only the first 5 rows
df.head(n=5)
[3]:
shape: (5, 3)
idnamephone
i64strstr
1"Maria Powell""836.602.8286x06763"
2"Jacqueline Fletcher PhD""+1-801-639-6835"
3"Mary Wilson""(999)905-1935"
4"Kenneth Davis""332-275-7129x0289"
5"Christopher Martin""(792)727-8878x82819"

Take Last N#

[4]:
# Keep only the last 5 rows
df.tail(n=5)
[4]:
shape: (5, 3)
idnamephone
i64strstr
46"William Keller""(695)744-1587x0883"
47"Vickie Perez""+1-297-963-9194x6132"
48"William Howard""630.391.0772x26252"
49"Laura Nash""(495)680-1361x6763"
50"Donald Johnson""(282)973-8109x024"

Select By Index Range#

Note: Polars has no index or multi-index; rows are selected by their 0-based integer position.

[5]:
# Method 1: pass the 0-based offset and the number of rows directly
df.slice(
    offset=5,   # skip the first 5 rows (ids 1-5)
    length=10,  # then keep the next 10 rows (ids 6-15)
)
[5]:
shape: (10, 3)
idnamephone
i64strstr
6"Patricia Brown""363-719-4855x7138"
7"Richard Hodges""(831)255-4407x13496"
8"Steve Green""671-700-3796x127"
9"Luis Smith""844.513.7054x4915"
10"Morgan Hensley""001-741-524-3690x958"
11"Tanya Peck""819-449-4406"
12"Mr. Joseph Parrish""409-954-9495x85424"
13"Juan Frazier""282.948.5514x337"
14"David Murray""902-780-8331x8359"
15"Ryan Campbell""7713098760"
[6]:
# Method 2: derive offset/length from a 1-based, inclusive row range
start_idx, end_idx = 6, 15
offset = start_idx - 1            # convert the 1-based start to a 0-based offset
length = end_idx - start_idx + 1  # the range is inclusive, hence the +1
df.slice(offset=offset, length=length)
[6]:
shape: (10, 3)
idnamephone
i64strstr
6"Patricia Brown""363-719-4855x7138"
7"Richard Hodges""(831)255-4407x13496"
8"Steve Green""671-700-3796x127"
9"Luis Smith""844.513.7054x4915"
10"Morgan Hensley""001-741-524-3690x958"
11"Tanya Peck""819-449-4406"
12"Mr. Joseph Parrish""409-954-9495x85424"
13"Juan Frazier""282.948.5514x337"
14"David Murray""902-780-8331x8359"
15"Ryan Campbell""7713098760"

Select One Row By Index#

[7]:
# Use row() to get a single row as a plain tuple;
# df[5] is not a substitute because it returns a DataFrame instead.
df.row(index=5)
[7]:
(6, 'Patricia Brown', '363-719-4855x7138')
[8]:
# named=True returns the same row as a {column name: value} dict
df.row(index=5, named=True)
[8]:
{'id': 6, 'name': 'Patricia Brown', 'phone': '363-719-4855x7138'}

Select Rows by Multiple Indices#

[9]:
# Index with a list of 0-based row positions (here the rows with id 2, 4 and 6)
row_positions = [1, 3, 5]
df[row_positions]
[9]:
shape: (3, 3)
idnamephone
i64strstr
2"Jacqueline Fletcher PhD""+1-801-639-6835"
4"Kenneth Davis""332-275-7129x0289"
6"Patricia Brown""363-719-4855x7138"

Randomly Sample By Rows#

[10]:
# Draw 5 rows at random; no seed is passed, so the selection changes between runs
df.sample(n=5)
[10]:
shape: (5, 3)
idnamephone
i64strstr
24"Erica Keller""864.464.0633"
46"William Keller""(695)744-1587x0883"
18"James Miller""369.515.1819x923"
21"Daniel Walker MD""+1-832-758-6101x9324"
12"Mr. Joseph Parrish""409-954-9495x85424"
[11]:
# Sample a share of the rows instead of a fixed count: 10% of the 50 rows gives 5 rows
df.sample(fraction=0.1)
[11]:
shape: (5, 3)
idnamephone
i64strstr
30"Sarah Pittman""6854545835"
11"Tanya Peck""819-449-4406"
8"Steve Green""671-700-3796x127"
17"Andrew Lee""754-587-2094"
40"Francisco Gonzalez""001-427-821-4543x436"
[12]:
# Sample half as many rows as df has, allowing the same row to be drawn more than once
df_res = df.sample(with_replacement=True, fraction=0.5)
df_res
[12]:
shape: (25, 3)
idnamephone
i64strstr
14"David Murray""902-780-8331x8359"
12"Mr. Joseph Parrish""409-954-9495x85424"
24"Erica Keller""864.464.0633"
20"Lindsay Meza""466-888-0910x674"
35"Patricia Simmons""(816)705-8827x9632"
24"Erica Keller""864.464.0633"
24"Erica Keller""864.464.0633"
10"Morgan Hensley""001-741-524-3690x958"
42"Tina Allen""669.671.7571"
50"Donald Johnson""(282)973-8109x024"
[13]:
# Rows can repeat when sampling with replacement, so the number of distinct
# rows is most likely smaller than the n_row / 2 rows that were drawn
df_res.n_unique()
[13]:
16

Select By Columns#

Select one Column#

[14]:
# select() with a single column name returns a one-column DataFrame (shape (50, 1))
df.select("name")
[14]:
shape: (50, 1)
name
str
"Maria Powell"
"Jacqueline Fletcher PhD"
"Mary Wilson"
"Kenneth Davis"
"Christopher Martin"
"William Keller"
"Vickie Perez"
"William Howard"
"Laura Nash"
"Donald Johnson"

Select Multiple Columns#

[15]:
# Pass a list of column names to keep several columns, in the order given
df.select(["id", "name"])
[15]:
shape: (50, 2)
idname
i64str
1"Maria Powell"
2"Jacqueline Fletcher PhD"
3"Mary Wilson"
4"Kenneth Davis"
5"Christopher Martin"
46"William Keller"
47"Vickie Perez"
48"William Howard"
49"Laura Nash"
50"Donald Johnson"
[16]:
# Column names in schema order; slice the list to pick columns by position
columns = df.columns
df.select(columns[:2])  # first two columns
[16]:
shape: (50, 2)
idname
i64str
1"Maria Powell"
2"Jacqueline Fletcher PhD"
3"Mary Wilson"
4"Kenneth Davis"
5"Christopher Martin"
46"William Keller"
47"Vickie Perez"
48"William Howard"
49"Laura Nash"
50"Donald Johnson"
[17]:
# Last two columns; relies on the `columns` list built in the previous cell
df.select(columns[-2:])
[17]:
shape: (50, 2)
namephone
strstr
"Maria Powell""836.602.8286x06763"
"Jacqueline Fletcher PhD""+1-801-639-6835"
"Mary Wilson""(999)905-1935"
"Kenneth Davis""332-275-7129x0289"
"Christopher Martin""(792)727-8878x82819"
"William Keller""(695)744-1587x0883"
"Vickie Perez""+1-297-963-9194x6132"
"William Howard""630.391.0772x26252"
"Laura Nash""(495)680-1361x6763"
"Donald Johnson""(282)973-8109x024"

Select By Both Rows and Columns#

[18]:
# Narrow to the wanted columns first, then pick rows by 0-based position
id_name_df = df.select(["id", "name"])
id_name_df[[1, 3, 5]]
[18]:
shape: (3, 2)
idname
i64str
2"Jacqueline Fletcher PhD"
4"Kenneth Davis"
6"Patricia Brown"

Select a Specific Cell#

[19]:
# Read a single cell as a Python scalar: row position + column name
df.item(row=0, column="id")
[19]:
1
[20]:
# Same cell, addressed by row position + column position
df.item(row=0, column=0)
[20]:
1

Iterate Over Columns#

[21]:
# iter_columns() yields each column as a polars Series, in column order.
# The `=` specifier in the f-strings echoes the expression text, so the
# loop variable name is part of the printed output.
for series in df.iter_columns():
    print(f"--- {series.name = }")
    print(f"{type(series) = }")
    print(f"{series = }")
--- series.name = 'id'
type(series) = <class 'polars.series.series.Series'>
series = shape: (50,)
Series: 'id' [i64]
[
        1
        2
        3
        4
        5
        …
        46
        47
        48
        49
        50
]
--- series.name = 'name'
type(series) = <class 'polars.series.series.Series'>
series = shape: (50,)
Series: 'name' [str]
[
        "Maria Powell"
        "Jacqueline Fletcher PhD"
        "Mary Wilson"
        "Kenneth Davis"
        "Christopher Martin"
        …
        "William Keller"
        "Vickie Perez"
        "William Howard"
        "Laura Nash"
        "Donald Johnson"
]
--- series.name = 'phone'
type(series) = <class 'polars.series.series.Series'>
series = shape: (50,)
Series: 'phone' [str]
[
        "836.602.8286x06763"
        "+1-801-639-6835"
        "(999)905-1935"
        "332-275-7129x0289"
        "(792)727-8878x82819"
        …
        "(695)744-1587x0883"
        "+1-297-963-9194x6132"
        "630.391.0772x26252"
        "(495)680-1361x6763"
        "(282)973-8109x024"
]

Iterate Over Rows#

[22]:
# iter_rows() yields every row as a plain tuple of values, in column order,
# so row[0] is the "id" value. This prints all 50 rows.
for row in df.iter_rows():
    print(f"--- {row[0] = }")
    print(f"{type(row) = }")
    print(f"{row = }")
--- row[0] = 1
type(row) = <class 'tuple'>
row = (1, 'Maria Powell', '836.602.8286x06763')
--- row[0] = 2
type(row) = <class 'tuple'>
row = (2, 'Jacqueline Fletcher PhD', '+1-801-639-6835')
--- row[0] = 3
type(row) = <class 'tuple'>
row = (3, 'Mary Wilson', '(999)905-1935')
--- row[0] = 4
type(row) = <class 'tuple'>
row = (4, 'Kenneth Davis', '332-275-7129x0289')
--- row[0] = 5
type(row) = <class 'tuple'>
row = (5, 'Christopher Martin', '(792)727-8878x82819')
--- row[0] = 6
type(row) = <class 'tuple'>
row = (6, 'Patricia Brown', '363-719-4855x7138')
--- row[0] = 7
type(row) = <class 'tuple'>
row = (7, 'Richard Hodges', '(831)255-4407x13496')
--- row[0] = 8
type(row) = <class 'tuple'>
row = (8, 'Steve Green', '671-700-3796x127')
--- row[0] = 9
type(row) = <class 'tuple'>
row = (9, 'Luis Smith', '844.513.7054x4915')
--- row[0] = 10
type(row) = <class 'tuple'>
row = (10, 'Morgan Hensley', '001-741-524-3690x958')
--- row[0] = 11
type(row) = <class 'tuple'>
row = (11, 'Tanya Peck', '819-449-4406')
--- row[0] = 12
type(row) = <class 'tuple'>
row = (12, 'Mr. Joseph Parrish', '409-954-9495x85424')
--- row[0] = 13
type(row) = <class 'tuple'>
row = (13, 'Juan Frazier', '282.948.5514x337')
--- row[0] = 14
type(row) = <class 'tuple'>
row = (14, 'David Murray', '902-780-8331x8359')
--- row[0] = 15
type(row) = <class 'tuple'>
row = (15, 'Ryan Campbell', '7713098760')
--- row[0] = 16
type(row) = <class 'tuple'>
row = (16, 'Debra Harris', '984.923.3706')
--- row[0] = 17
type(row) = <class 'tuple'>
row = (17, 'Andrew Lee', '754-587-2094')
--- row[0] = 18
type(row) = <class 'tuple'>
row = (18, 'James Miller', '369.515.1819x923')
--- row[0] = 19
type(row) = <class 'tuple'>
row = (19, 'Emma Gentry', '001-323-490-9924x59516')
--- row[0] = 20
type(row) = <class 'tuple'>
row = (20, 'Lindsay Meza', '466-888-0910x674')
--- row[0] = 21
type(row) = <class 'tuple'>
row = (21, 'Daniel Walker MD', '+1-832-758-6101x9324')
--- row[0] = 22
type(row) = <class 'tuple'>
row = (22, 'Jon Howard', '666-699-5756x1435')
--- row[0] = 23
type(row) = <class 'tuple'>
row = (23, 'Shannon Johnson', '8148458256')
--- row[0] = 24
type(row) = <class 'tuple'>
row = (24, 'Erica Keller', '864.464.0633')
--- row[0] = 25
type(row) = <class 'tuple'>
row = (25, 'Betty Perez', '869.484.7373')
--- row[0] = 26
type(row) = <class 'tuple'>
row = (26, 'John Wiley Jr.', '882-603-1099x880')
--- row[0] = 27
type(row) = <class 'tuple'>
row = (27, 'Samantha Gutierrez', '(688)944-5982x705')
--- row[0] = 28
type(row) = <class 'tuple'>
row = (28, 'Matthew Tucker', '+1-577-433-9373x084')
--- row[0] = 29
type(row) = <class 'tuple'>
row = (29, 'Jennifer Black', '(832)819-8567')
--- row[0] = 30
type(row) = <class 'tuple'>
row = (30, 'Sarah Pittman', '6854545835')
--- row[0] = 31
type(row) = <class 'tuple'>
row = (31, 'Dennis Vaughan', '6862893476')
--- row[0] = 32
type(row) = <class 'tuple'>
row = (32, 'Kevin Martinez', '(419)553-1369')
--- row[0] = 33
type(row) = <class 'tuple'>
row = (33, 'Victor Norman', '303-258-9000x83809')
--- row[0] = 34
type(row) = <class 'tuple'>
row = (34, 'Daniel Murray', '675-839-2466')
--- row[0] = 35
type(row) = <class 'tuple'>
row = (35, 'Patricia Simmons', '(816)705-8827x9632')
--- row[0] = 36
type(row) = <class 'tuple'>
row = (36, 'Emma Gomez', '653.227.9975')
--- row[0] = 37
type(row) = <class 'tuple'>
row = (37, 'Charles Robinson', '+1-687-438-5234')
--- row[0] = 38
type(row) = <class 'tuple'>
row = (38, 'Anthony Smith', '+1-323-237-2932')
--- row[0] = 39
type(row) = <class 'tuple'>
row = (39, 'Steven Bennett', '6106982086')
--- row[0] = 40
type(row) = <class 'tuple'>
row = (40, 'Francisco Gonzalez', '001-427-821-4543x436')
--- row[0] = 41
type(row) = <class 'tuple'>
row = (41, 'Stephanie Shaffer', '001-999-779-0085x959')
--- row[0] = 42
type(row) = <class 'tuple'>
row = (42, 'Tina Allen', '669.671.7571')
--- row[0] = 43
type(row) = <class 'tuple'>
row = (43, 'Matthew Harrell', '678.831.5948')
--- row[0] = 44
type(row) = <class 'tuple'>
row = (44, 'John Hill', '355.476.5055')
--- row[0] = 45
type(row) = <class 'tuple'>
row = (45, 'James Shea', '+1-903-530-8220x480')
--- row[0] = 46
type(row) = <class 'tuple'>
row = (46, 'William Keller', '(695)744-1587x0883')
--- row[0] = 47
type(row) = <class 'tuple'>
row = (47, 'Vickie Perez', '+1-297-963-9194x6132')
--- row[0] = 48
type(row) = <class 'tuple'>
row = (48, 'William Howard', '630.391.0772x26252')
--- row[0] = 49
type(row) = <class 'tuple'>
row = (49, 'Laura Nash', '(495)680-1361x6763')
--- row[0] = 50
type(row) = <class 'tuple'>
row = (50, 'Donald Johnson', '(282)973-8109x024')

Iterate Over Slices#

Split the DataFrame into consecutive sub-DataFrames of at most ``n_rows`` rows each.

[23]:
# When the total number of rows is a multiple of ``n_rows``:
# 50 rows in chunks of n_row // 5 = 10 gives five equally sized sub-DataFrames
for ith_df, sub_df in enumerate(df.iter_slices(n_rows=n_row // 5), start=1):
    print(f"--- {ith_df = }")
    print(sub_df)
--- ith_df = 1
shape: (10, 3)
┌─────┬─────────────────────────┬──────────────────────┐
│ id  ┆ name                    ┆ phone                │
│ --- ┆ ---                     ┆ ---                  │
│ i64 ┆ str                     ┆ str                  │
╞═════╪═════════════════════════╪══════════════════════╡
│ 1   ┆ Maria Powell            ┆ 836.602.8286x06763   │
│ 2   ┆ Jacqueline Fletcher PhD ┆ +1-801-639-6835      │
│ 3   ┆ Mary Wilson             ┆ (999)905-1935        │
│ 4   ┆ Kenneth Davis           ┆ 332-275-7129x0289    │
│ 5   ┆ Christopher Martin      ┆ (792)727-8878x82819  │
│ 6   ┆ Patricia Brown          ┆ 363-719-4855x7138    │
│ 7   ┆ Richard Hodges          ┆ (831)255-4407x13496  │
│ 8   ┆ Steve Green             ┆ 671-700-3796x127     │
│ 9   ┆ Luis Smith              ┆ 844.513.7054x4915    │
│ 10  ┆ Morgan Hensley          ┆ 001-741-524-3690x958 │
└─────┴─────────────────────────┴──────────────────────┘
--- ith_df = 2
shape: (10, 3)
┌─────┬────────────────────┬────────────────────────┐
│ id  ┆ name               ┆ phone                  │
│ --- ┆ ---                ┆ ---                    │
│ i64 ┆ str                ┆ str                    │
╞═════╪════════════════════╪════════════════════════╡
│ 11  ┆ Tanya Peck         ┆ 819-449-4406           │
│ 12  ┆ Mr. Joseph Parrish ┆ 409-954-9495x85424     │
│ 13  ┆ Juan Frazier       ┆ 282.948.5514x337       │
│ 14  ┆ David Murray       ┆ 902-780-8331x8359      │
│ 15  ┆ Ryan Campbell      ┆ 7713098760             │
│ 16  ┆ Debra Harris       ┆ 984.923.3706           │
│ 17  ┆ Andrew Lee         ┆ 754-587-2094           │
│ 18  ┆ James Miller       ┆ 369.515.1819x923       │
│ 19  ┆ Emma Gentry        ┆ 001-323-490-9924x59516 │
│ 20  ┆ Lindsay Meza       ┆ 466-888-0910x674       │
└─────┴────────────────────┴────────────────────────┘
--- ith_df = 3
shape: (10, 3)
┌─────┬────────────────────┬──────────────────────┐
│ id  ┆ name               ┆ phone                │
│ --- ┆ ---                ┆ ---                  │
│ i64 ┆ str                ┆ str                  │
╞═════╪════════════════════╪══════════════════════╡
│ 21  ┆ Daniel Walker MD   ┆ +1-832-758-6101x9324 │
│ 22  ┆ Jon Howard         ┆ 666-699-5756x1435    │
│ 23  ┆ Shannon Johnson    ┆ 8148458256           │
│ 24  ┆ Erica Keller       ┆ 864.464.0633         │
│ 25  ┆ Betty Perez        ┆ 869.484.7373         │
│ 26  ┆ John Wiley Jr.     ┆ 882-603-1099x880     │
│ 27  ┆ Samantha Gutierrez ┆ (688)944-5982x705    │
│ 28  ┆ Matthew Tucker     ┆ +1-577-433-9373x084  │
│ 29  ┆ Jennifer Black     ┆ (832)819-8567        │
│ 30  ┆ Sarah Pittman      ┆ 6854545835           │
└─────┴────────────────────┴──────────────────────┘
--- ith_df = 4
shape: (10, 3)
┌─────┬────────────────────┬──────────────────────┐
│ id  ┆ name               ┆ phone                │
│ --- ┆ ---                ┆ ---                  │
│ i64 ┆ str                ┆ str                  │
╞═════╪════════════════════╪══════════════════════╡
│ 31  ┆ Dennis Vaughan     ┆ 6862893476           │
│ 32  ┆ Kevin Martinez     ┆ (419)553-1369        │
│ 33  ┆ Victor Norman      ┆ 303-258-9000x83809   │
│ 34  ┆ Daniel Murray      ┆ 675-839-2466         │
│ 35  ┆ Patricia Simmons   ┆ (816)705-8827x9632   │
│ 36  ┆ Emma Gomez         ┆ 653.227.9975         │
│ 37  ┆ Charles Robinson   ┆ +1-687-438-5234      │
│ 38  ┆ Anthony Smith      ┆ +1-323-237-2932      │
│ 39  ┆ Steven Bennett     ┆ 6106982086           │
│ 40  ┆ Francisco Gonzalez ┆ 001-427-821-4543x436 │
└─────┴────────────────────┴──────────────────────┘
--- ith_df = 5
shape: (10, 3)
┌─────┬───────────────────┬──────────────────────┐
│ id  ┆ name              ┆ phone                │
│ --- ┆ ---               ┆ ---                  │
│ i64 ┆ str               ┆ str                  │
╞═════╪═══════════════════╪══════════════════════╡
│ 41  ┆ Stephanie Shaffer ┆ 001-999-779-0085x959 │
│ 42  ┆ Tina Allen        ┆ 669.671.7571         │
│ 43  ┆ Matthew Harrell   ┆ 678.831.5948         │
│ 44  ┆ John Hill         ┆ 355.476.5055         │
│ 45  ┆ James Shea        ┆ +1-903-530-8220x480  │
│ 46  ┆ William Keller    ┆ (695)744-1587x0883   │
│ 47  ┆ Vickie Perez      ┆ +1-297-963-9194x6132 │
│ 48  ┆ William Howard    ┆ 630.391.0772x26252   │
│ 49  ┆ Laura Nash        ┆ (495)680-1361x6763   │
│ 50  ┆ Donald Johnson    ┆ (282)973-8109x024    │
└─────┴───────────────────┴──────────────────────┘
[24]:
# When the total number of rows is NOT a multiple of ``n_rows``:
# the last sub-DataFrame simply holds the remaining rows (here 13 + 13 + 13 + 11)
for ith_df, sub_df in enumerate(df.iter_slices(n_rows=13), start=1):
    print(f"--- {ith_df = }")
    print(sub_df)
--- ith_df = 1
shape: (13, 3)
┌─────┬─────────────────────────┬──────────────────────┐
│ id  ┆ name                    ┆ phone                │
│ --- ┆ ---                     ┆ ---                  │
│ i64 ┆ str                     ┆ str                  │
╞═════╪═════════════════════════╪══════════════════════╡
│ 1   ┆ Maria Powell            ┆ 836.602.8286x06763   │
│ 2   ┆ Jacqueline Fletcher PhD ┆ +1-801-639-6835      │
│ 3   ┆ Mary Wilson             ┆ (999)905-1935        │
│ 4   ┆ Kenneth Davis           ┆ 332-275-7129x0289    │
│ 5   ┆ Christopher Martin      ┆ (792)727-8878x82819  │
│ …   ┆ …                       ┆ …                    │
│ 9   ┆ Luis Smith              ┆ 844.513.7054x4915    │
│ 10  ┆ Morgan Hensley          ┆ 001-741-524-3690x958 │
│ 11  ┆ Tanya Peck              ┆ 819-449-4406         │
│ 12  ┆ Mr. Joseph Parrish      ┆ 409-954-9495x85424   │
│ 13  ┆ Juan Frazier            ┆ 282.948.5514x337     │
└─────┴─────────────────────────┴──────────────────────┘
--- ith_df = 2
shape: (13, 3)
┌─────┬─────────────────┬───────────────────┐
│ id  ┆ name            ┆ phone             │
│ --- ┆ ---             ┆ ---               │
│ i64 ┆ str             ┆ str               │
╞═════╪═════════════════╪═══════════════════╡
│ 14  ┆ David Murray    ┆ 902-780-8331x8359 │
│ 15  ┆ Ryan Campbell   ┆ 7713098760        │
│ 16  ┆ Debra Harris    ┆ 984.923.3706      │
│ 17  ┆ Andrew Lee      ┆ 754-587-2094      │
│ 18  ┆ James Miller    ┆ 369.515.1819x923  │
│ …   ┆ …               ┆ …                 │
│ 22  ┆ Jon Howard      ┆ 666-699-5756x1435 │
│ 23  ┆ Shannon Johnson ┆ 8148458256        │
│ 24  ┆ Erica Keller    ┆ 864.464.0633      │
│ 25  ┆ Betty Perez     ┆ 869.484.7373      │
│ 26  ┆ John Wiley Jr.  ┆ 882-603-1099x880  │
└─────┴─────────────────┴───────────────────┘
--- ith_df = 3
shape: (13, 3)
┌─────┬────────────────────┬─────────────────────┐
│ id  ┆ name               ┆ phone               │
│ --- ┆ ---                ┆ ---                 │
│ i64 ┆ str                ┆ str                 │
╞═════╪════════════════════╪═════════════════════╡
│ 27  ┆ Samantha Gutierrez ┆ (688)944-5982x705   │
│ 28  ┆ Matthew Tucker     ┆ +1-577-433-9373x084 │
│ 29  ┆ Jennifer Black     ┆ (832)819-8567       │
│ 30  ┆ Sarah Pittman      ┆ 6854545835          │
│ 31  ┆ Dennis Vaughan     ┆ 6862893476          │
│ …   ┆ …                  ┆ …                   │
│ 35  ┆ Patricia Simmons   ┆ (816)705-8827x9632  │
│ 36  ┆ Emma Gomez         ┆ 653.227.9975        │
│ 37  ┆ Charles Robinson   ┆ +1-687-438-5234     │
│ 38  ┆ Anthony Smith      ┆ +1-323-237-2932     │
│ 39  ┆ Steven Bennett     ┆ 6106982086          │
└─────┴────────────────────┴─────────────────────┘
--- ith_df = 4
shape: (11, 3)
┌─────┬────────────────────┬──────────────────────┐
│ id  ┆ name               ┆ phone                │
│ --- ┆ ---                ┆ ---                  │
│ i64 ┆ str                ┆ str                  │
╞═════╪════════════════════╪══════════════════════╡
│ 40  ┆ Francisco Gonzalez ┆ 001-427-821-4543x436 │
│ 41  ┆ Stephanie Shaffer  ┆ 001-999-779-0085x959 │
│ 42  ┆ Tina Allen         ┆ 669.671.7571         │
│ 43  ┆ Matthew Harrell    ┆ 678.831.5948         │
│ 44  ┆ John Hill          ┆ 355.476.5055         │
│ …   ┆ …                  ┆ …                    │
│ 46  ┆ William Keller     ┆ (695)744-1587x0883   │
│ 47  ┆ Vickie Perez       ┆ +1-297-963-9194x6132 │
│ 48  ┆ William Howard     ┆ 630.391.0772x26252   │
│ 49  ┆ Laura Nash         ┆ (495)680-1361x6763   │
│ 50  ┆ Donald Johnson     ┆ (282)973-8109x024    │
└─────┴────────────────────┴──────────────────────┘

Concatenate#

Concatenate Vertically (More rows, same columns)#

DataFrame.extend will edit the first DataFrame in-place!

[25]:
# extend() appends df2's rows onto df1 itself; the next cell shows df1 with 6 rows
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"id": [4, 5, 6]})
df1.extend(df2)
[25]:
shape: (6, 1)
id
i64
1
2
3
4
5
6
[26]:
# df1 was modified by extend(): it now holds all 6 rows
df1
[26]:
shape: (6, 1)
id
i64
1
2
3
4
5
6

DataFrame.vstack DOES NOT edit the first DataFrame in-place!

[27]:
# vstack() returns a new 6-row DataFrame; df1 is left as it was (checked in the next cell)
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"id": [4, 5, 6]})
df1.vstack(df2)
[27]:
shape: (6, 1)
id
i64
1
2
3
4
5
6
[28]:
# df1 still has only its original 3 rows
df1
[28]:
shape: (3, 1)
id
i64
1
2
3
[30]:
# vstack() needs both frames to have the same columns. df2 carries an extra
# "name" column, so Polars raises a ShapeError, which is caught and displayed.
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"id": [4, 5, 6], "name": ["d", "e", "f"]})
try:
    df1.vstack(df2)
except pl.exceptions.ShapeError as e:
    # Catch only the expected error so that any unrelated failure still surfaces
    print(repr(e))
ShapeError('unable to append to a DataFrame of width 1 with a DataFrame of width 2')

Concatenate Horizontally (More columns, same rows)#

DataFrame.hstack DOES NOT edit the first DataFrame in-place!

[31]:
# hstack() returns a new DataFrame with df2's columns placed next to df1's;
# df1 is left as it was (checked in the next cell)
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"name": ["a", "b", "c"]})
df1.hstack(df2)
[31]:
shape: (3, 2)
idname
i64str
1"a"
2"b"
3"c"
[32]:
# df1 still has only its original "id" column
df1
[32]:
shape: (3, 1)
id
i64
1
2
3
[33]:
# hstack() needs both frames to have the same number of rows. df1 has 3 rows
# and df2 has 5, so Polars raises a ShapeError, which is caught and displayed.
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"name": ["a", "b", "c", "d", "e"]})
try:
    df1.hstack(df2)
except pl.exceptions.ShapeError as e:
    # Catch only the expected error so that any unrelated failure still surfaces
    print(repr(e))
ShapeError('could not create a new DataFrame: series "id" has length 3 while series "name" has length 5')
[ ]: