Manipulation Selection#
Ref:
[2]:
import polars as pl
import faker
# Build a small demo DataFrame of fake contacts that the rest of the notebook
# selects from.
fake = faker.Faker()
# Seed Faker's shared random generator so the generated names and phone
# numbers are the same after every "Restart Kernel -> Run All".
faker.Faker.seed(0)

n_row = 50
data = {
    "id": list(range(1, 1 + n_row)),  # 1-based ids: 1..n_row
    "name": [fake.name() for _ in range(n_row)],
    "phone": [fake.phone_number() for _ in range(n_row)],
}
df = pl.DataFrame(data)
df
[2]:
shape: (50, 3)
| id | name | phone |
|---|---|---|
| i64 | str | str |
| 1 | "Maria Powell" | "836.602.8286x06763" |
| 2 | "Jacqueline Fletcher PhD" | "+1-801-639-6835" |
| 3 | "Mary Wilson" | "(999)905-1935" |
| 4 | "Kenneth Davis" | "332-275-7129x0289" |
| 5 | "Christopher Martin" | "(792)727-8878x82819" |
| … | … | … |
| 46 | "William Keller" | "(695)744-1587x0883" |
| 47 | "Vickie Perez" | "+1-297-963-9194x6132" |
| 48 | "William Howard" | "630.391.0772x26252" |
| 49 | "Laura Nash" | "(495)680-1361x6763" |
| 50 | "Donald Johnson" | "(282)973-8109x024" |
Select by Rows#
Take First N#
[3]:
# head(5) keeps the first five rows of df.
df.head(5)
[3]:
shape: (5, 3)
| id | name | phone |
|---|---|---|
| i64 | str | str |
| 1 | "Maria Powell" | "836.602.8286x06763" |
| 2 | "Jacqueline Fletcher PhD" | "+1-801-639-6835" |
| 3 | "Mary Wilson" | "(999)905-1935" |
| 4 | "Kenneth Davis" | "332-275-7129x0289" |
| 5 | "Christopher Martin" | "(792)727-8878x82819" |
Take Last N#
[4]:
# tail(5) keeps the last five rows of df.
df.tail(5)
[4]:
shape: (5, 3)
| id | name | phone |
|---|---|---|
| i64 | str | str |
| 46 | "William Keller" | "(695)744-1587x0883" |
| 47 | "Vickie Perez" | "+1-297-963-9194x6132" |
| 48 | "William Howard" | "630.391.0772x26252" |
| 49 | "Laura Nash" | "(495)680-1361x6763" |
| 50 | "Donald Johnson" | "(282)973-8109x024" |
Select By Index Range#
Note: Polars has no index or multi-index, so rows are selected by their integer position.
[5]:
# Method 1: skip the first 5 rows, then take the next 10 (here ids 6 through 15).
df.slice(offset=5, length=10)
[5]:
shape: (10, 3)
| id | name | phone |
|---|---|---|
| i64 | str | str |
| 6 | "Patricia Brown" | "363-719-4855x7138" |
| 7 | "Richard Hodges" | "(831)255-4407x13496" |
| 8 | "Steve Green" | "671-700-3796x127" |
| 9 | "Luis Smith" | "844.513.7054x4915" |
| 10 | "Morgan Hensley" | "001-741-524-3690x958" |
| 11 | "Tanya Peck" | "819-449-4406" |
| 12 | "Mr. Joseph Parrish" | "409-954-9495x85424" |
| 13 | "Juan Frazier" | "282.948.5514x337" |
| 14 | "David Murray" | "902-780-8331x8359" |
| 15 | "Ryan Campbell" | "7713098760" |
[6]:
# Method 2: turn an inclusive, 1-based row range into slice arguments.
start_idx = 6
end_idx = 15
row_offset = start_idx - 1           # slice offsets are zero-based
row_count = end_idx - start_idx + 1  # both endpoints are included
df.slice(offset=row_offset, length=row_count)
[6]:
shape: (10, 3)
| id | name | phone |
|---|---|---|
| i64 | str | str |
| 6 | "Patricia Brown" | "363-719-4855x7138" |
| 7 | "Richard Hodges" | "(831)255-4407x13496" |
| 8 | "Steve Green" | "671-700-3796x127" |
| 9 | "Luis Smith" | "844.513.7054x4915" |
| 10 | "Morgan Hensley" | "001-741-524-3690x958" |
| 11 | "Tanya Peck" | "819-449-4406" |
| 12 | "Mr. Joseph Parrish" | "409-954-9495x85424" |
| 13 | "Juan Frazier" | "282.948.5514x337" |
| 14 | "David Murray" | "902-780-8331x8359" |
| 15 | "Ryan Campbell" | "7713098760" |
Select One Row By Index#
[7]:
# Don't do df[5]; it will return a DataFrame.
# df.row(5) returns the row at zero-based position 5 as a plain tuple.
df.row(5)
[7]:
(6, 'Patricia Brown', '363-719-4855x7138')
[8]:
# named=True returns the same row as a dict keyed by column name.
df.row(5, named=True)
[8]:
{'id': 6, 'name': 'Patricia Brown', 'phone': '363-719-4855x7138'}
Select Rows by Multiple Index#
[9]:
# Indexing with a list of zero-based positions selects those rows (here ids 2, 4 and 6).
df[[1, 3, 5]]
[9]:
shape: (3, 3)
| id | name | phone |
|---|---|---|
| i64 | str | str |
| 2 | "Jacqueline Fletcher PhD" | "+1-801-639-6835" |
| 4 | "Kenneth Davis" | "332-275-7129x0289" |
| 6 | "Patricia Brown" | "363-719-4855x7138" |
Randomly Sample By Rows#
[10]:
# Draw 5 rows at random; the fixed seed makes the same rows come back on
# every run, so the notebook is reproducible.
df.sample(n=5, seed=42)
[10]:
shape: (5, 3)
| id | name | phone |
|---|---|---|
| i64 | str | str |
| 24 | "Erica Keller" | "864.464.0633" |
| 46 | "William Keller" | "(695)744-1587x0883" |
| 18 | "James Miller" | "369.515.1819x923" |
| 21 | "Daniel Walker MD" | "+1-832-758-6101x9324" |
| 12 | "Mr. Joseph Parrish" | "409-954-9495x85424" |
[11]:
# Draw 10% of the rows at random (5 of the 50 here); seeded for reproducibility.
df.sample(fraction=0.1, seed=42)
[11]:
shape: (5, 3)
| id | name | phone |
|---|---|---|
| i64 | str | str |
| 30 | "Sarah Pittman" | "6854545835" |
| 11 | "Tanya Peck" | "819-449-4406" |
| 8 | "Steve Green" | "671-700-3796x127" |
| 17 | "Andrew Lee" | "754-587-2094" |
| 40 | "Francisco Gonzalez" | "001-427-821-4543x436" |
[12]:
# Sampling with replacement may pick the same row more than once.
# The fixed seed keeps df_res, and the unique-row count computed from it,
# stable across runs.
df_res = df.sample(fraction=0.5, with_replacement=True, seed=42)
df_res
[12]:
shape: (25, 3)
| id | name | phone |
|---|---|---|
| i64 | str | str |
| 14 | "David Murray" | "902-780-8331x8359" |
| 12 | "Mr. Joseph Parrish" | "409-954-9495x85424" |
| 24 | "Erica Keller" | "864.464.0633" |
| 20 | "Lindsay Meza" | "466-888-0910x674" |
| 35 | "Patricia Simmons" | "(816)705-8827x9632" |
| … | … | … |
| 24 | "Erica Keller" | "864.464.0633" |
| 24 | "Erica Keller" | "864.464.0633" |
| 10 | "Morgan Hensley" | "001-741-524-3690x958" |
| 42 | "Tina Allen" | "669.671.7571" |
| 50 | "Donald Johnson" | "(282)973-8109x024" |
[13]:
# df_res was sampled with replacement, so it can contain repeated rows;
# the number of unique rows is therefore most likely below n_row / 2.
df_res.n_unique()
[13]:
16
Select By Columns#
Select one Column#
[14]:
# Selecting a single column name yields a one-column DataFrame.
df.select("name")
[14]:
shape: (50, 1)
| name |
|---|
| str |
| "Maria Powell" |
| "Jacqueline Fletcher PhD" |
| "Mary Wilson" |
| "Kenneth Davis" |
| "Christopher Martin" |
| … |
| "William Keller" |
| "Vickie Perez" |
| "William Howard" |
| "Laura Nash" |
| "Donald Johnson" |
Select multiple Column#
[15]:
# Pass a list of column names to keep several columns at once.
df.select(["id", "name"])
[15]:
shape: (50, 2)
| id | name |
|---|---|
| i64 | str |
| 1 | "Maria Powell" |
| 2 | "Jacqueline Fletcher PhD" |
| 3 | "Mary Wilson" |
| 4 | "Kenneth Davis" |
| 5 | "Christopher Martin" |
| … | … |
| 46 | "William Keller" |
| 47 | "Vickie Perez" |
| 48 | "William Howard" |
| 49 | "Laura Nash" |
| 50 | "Donald Johnson" |
[16]:
# Column names as a list of strings; slice it to keep the first two columns.
columns = df.columns
df.select(columns[:2])
[16]:
shape: (50, 2)
| id | name |
|---|---|
| i64 | str |
| 1 | "Maria Powell" |
| 2 | "Jacqueline Fletcher PhD" |
| 3 | "Mary Wilson" |
| 4 | "Kenneth Davis" |
| 5 | "Christopher Martin" |
| … | … |
| 46 | "William Keller" |
| 47 | "Vickie Perez" |
| 48 | "William Howard" |
| 49 | "Laura Nash" |
| 50 | "Donald Johnson" |
[17]:
# Last two columns, taken from the `columns` list built in the previous cell.
df.select(columns[-2:])
[17]:
shape: (50, 2)
| name | phone |
|---|---|
| str | str |
| "Maria Powell" | "836.602.8286x06763" |
| "Jacqueline Fletcher PhD" | "+1-801-639-6835" |
| "Mary Wilson" | "(999)905-1935" |
| "Kenneth Davis" | "332-275-7129x0289" |
| "Christopher Martin" | "(792)727-8878x82819" |
| … | … |
| "William Keller" | "(695)744-1587x0883" |
| "Vickie Perez" | "+1-297-963-9194x6132" |
| "William Howard" | "630.391.0772x26252" |
| "Laura Nash" | "(495)680-1361x6763" |
| "Donald Johnson" | "(282)973-8109x024" |
Select By Both Rows and Columns#
[18]:
# Narrow to the wanted columns first, then pick rows by zero-based position.
df.select(["id", "name"])[[1, 3, 5]]
[18]:
shape: (3, 2)
| id | name |
|---|---|
| i64 | str |
| 2 | "Jacqueline Fletcher PhD" |
| 4 | "Kenneth Davis" |
| 6 | "Patricia Brown" |
Select a Specific Cell#
[19]:
# Read a single cell: row by zero-based position, column by name.
df.item(row=0, column="id")
[19]:
1
[20]:
# Same cell, addressing the column by zero-based position instead of by name.
df.item(row=0, column=0)
[20]:
1
Iterate Over Columns#
[21]:
# Each item yielded by iter_columns() is one column of df as a polars Series.
for series in df.iter_columns():
    print(
        f"--- {series.name = }",
        f"{type(series) = }",
        f"{series = }",
        sep="\n",
    )
--- series.name = 'id'
type(series) = <class 'polars.series.series.Series'>
series = shape: (50,)
Series: 'id' [i64]
[
1
2
3
4
5
…
46
47
48
49
50
]
--- series.name = 'name'
type(series) = <class 'polars.series.series.Series'>
series = shape: (50,)
Series: 'name' [str]
[
"Maria Powell"
"Jacqueline Fletcher PhD"
"Mary Wilson"
"Kenneth Davis"
"Christopher Martin"
…
"William Keller"
"Vickie Perez"
"William Howard"
"Laura Nash"
"Donald Johnson"
]
--- series.name = 'phone'
type(series) = <class 'polars.series.series.Series'>
series = shape: (50,)
Series: 'phone' [str]
[
"836.602.8286x06763"
"+1-801-639-6835"
"(999)905-1935"
"332-275-7129x0289"
"(792)727-8878x82819"
…
"(695)744-1587x0883"
"+1-297-963-9194x6132"
"630.391.0772x26252"
"(495)680-1361x6763"
"(282)973-8109x024"
]
Iterate Over Rows#
[22]:
# Each item yielded by iter_rows() is one row of df as a plain tuple.
for row in df.iter_rows():
    print(
        f"--- {row[0] = }",
        f"{type(row) = }",
        f"{row = }",
        sep="\n",
    )
--- row[0] = 1
type(row) = <class 'tuple'>
row = (1, 'Maria Powell', '836.602.8286x06763')
--- row[0] = 2
type(row) = <class 'tuple'>
row = (2, 'Jacqueline Fletcher PhD', '+1-801-639-6835')
--- row[0] = 3
type(row) = <class 'tuple'>
row = (3, 'Mary Wilson', '(999)905-1935')
--- row[0] = 4
type(row) = <class 'tuple'>
row = (4, 'Kenneth Davis', '332-275-7129x0289')
--- row[0] = 5
type(row) = <class 'tuple'>
row = (5, 'Christopher Martin', '(792)727-8878x82819')
--- row[0] = 6
type(row) = <class 'tuple'>
row = (6, 'Patricia Brown', '363-719-4855x7138')
--- row[0] = 7
type(row) = <class 'tuple'>
row = (7, 'Richard Hodges', '(831)255-4407x13496')
--- row[0] = 8
type(row) = <class 'tuple'>
row = (8, 'Steve Green', '671-700-3796x127')
--- row[0] = 9
type(row) = <class 'tuple'>
row = (9, 'Luis Smith', '844.513.7054x4915')
--- row[0] = 10
type(row) = <class 'tuple'>
row = (10, 'Morgan Hensley', '001-741-524-3690x958')
--- row[0] = 11
type(row) = <class 'tuple'>
row = (11, 'Tanya Peck', '819-449-4406')
--- row[0] = 12
type(row) = <class 'tuple'>
row = (12, 'Mr. Joseph Parrish', '409-954-9495x85424')
--- row[0] = 13
type(row) = <class 'tuple'>
row = (13, 'Juan Frazier', '282.948.5514x337')
--- row[0] = 14
type(row) = <class 'tuple'>
row = (14, 'David Murray', '902-780-8331x8359')
--- row[0] = 15
type(row) = <class 'tuple'>
row = (15, 'Ryan Campbell', '7713098760')
--- row[0] = 16
type(row) = <class 'tuple'>
row = (16, 'Debra Harris', '984.923.3706')
--- row[0] = 17
type(row) = <class 'tuple'>
row = (17, 'Andrew Lee', '754-587-2094')
--- row[0] = 18
type(row) = <class 'tuple'>
row = (18, 'James Miller', '369.515.1819x923')
--- row[0] = 19
type(row) = <class 'tuple'>
row = (19, 'Emma Gentry', '001-323-490-9924x59516')
--- row[0] = 20
type(row) = <class 'tuple'>
row = (20, 'Lindsay Meza', '466-888-0910x674')
--- row[0] = 21
type(row) = <class 'tuple'>
row = (21, 'Daniel Walker MD', '+1-832-758-6101x9324')
--- row[0] = 22
type(row) = <class 'tuple'>
row = (22, 'Jon Howard', '666-699-5756x1435')
--- row[0] = 23
type(row) = <class 'tuple'>
row = (23, 'Shannon Johnson', '8148458256')
--- row[0] = 24
type(row) = <class 'tuple'>
row = (24, 'Erica Keller', '864.464.0633')
--- row[0] = 25
type(row) = <class 'tuple'>
row = (25, 'Betty Perez', '869.484.7373')
--- row[0] = 26
type(row) = <class 'tuple'>
row = (26, 'John Wiley Jr.', '882-603-1099x880')
--- row[0] = 27
type(row) = <class 'tuple'>
row = (27, 'Samantha Gutierrez', '(688)944-5982x705')
--- row[0] = 28
type(row) = <class 'tuple'>
row = (28, 'Matthew Tucker', '+1-577-433-9373x084')
--- row[0] = 29
type(row) = <class 'tuple'>
row = (29, 'Jennifer Black', '(832)819-8567')
--- row[0] = 30
type(row) = <class 'tuple'>
row = (30, 'Sarah Pittman', '6854545835')
--- row[0] = 31
type(row) = <class 'tuple'>
row = (31, 'Dennis Vaughan', '6862893476')
--- row[0] = 32
type(row) = <class 'tuple'>
row = (32, 'Kevin Martinez', '(419)553-1369')
--- row[0] = 33
type(row) = <class 'tuple'>
row = (33, 'Victor Norman', '303-258-9000x83809')
--- row[0] = 34
type(row) = <class 'tuple'>
row = (34, 'Daniel Murray', '675-839-2466')
--- row[0] = 35
type(row) = <class 'tuple'>
row = (35, 'Patricia Simmons', '(816)705-8827x9632')
--- row[0] = 36
type(row) = <class 'tuple'>
row = (36, 'Emma Gomez', '653.227.9975')
--- row[0] = 37
type(row) = <class 'tuple'>
row = (37, 'Charles Robinson', '+1-687-438-5234')
--- row[0] = 38
type(row) = <class 'tuple'>
row = (38, 'Anthony Smith', '+1-323-237-2932')
--- row[0] = 39
type(row) = <class 'tuple'>
row = (39, 'Steven Bennett', '6106982086')
--- row[0] = 40
type(row) = <class 'tuple'>
row = (40, 'Francisco Gonzalez', '001-427-821-4543x436')
--- row[0] = 41
type(row) = <class 'tuple'>
row = (41, 'Stephanie Shaffer', '001-999-779-0085x959')
--- row[0] = 42
type(row) = <class 'tuple'>
row = (42, 'Tina Allen', '669.671.7571')
--- row[0] = 43
type(row) = <class 'tuple'>
row = (43, 'Matthew Harrell', '678.831.5948')
--- row[0] = 44
type(row) = <class 'tuple'>
row = (44, 'John Hill', '355.476.5055')
--- row[0] = 45
type(row) = <class 'tuple'>
row = (45, 'James Shea', '+1-903-530-8220x480')
--- row[0] = 46
type(row) = <class 'tuple'>
row = (46, 'William Keller', '(695)744-1587x0883')
--- row[0] = 47
type(row) = <class 'tuple'>
row = (47, 'Vickie Perez', '+1-297-963-9194x6132')
--- row[0] = 48
type(row) = <class 'tuple'>
row = (48, 'William Howard', '630.391.0772x26252')
--- row[0] = 49
type(row) = <class 'tuple'>
row = (49, 'Laura Nash', '(495)680-1361x6763')
--- row[0] = 50
type(row) = <class 'tuple'>
row = (50, 'Donald Johnson', '(282)973-8109x024')
Iterate Over Slices#
Sub-DataFrames, each with fewer rows than the original.
[23]:
# Case 1: the total number of rows is an exact multiple of ``n_rows``,
# so every sub-DataFrame has the same number of rows.
for ith_df, sub_df in enumerate(df.iter_slices(n_rows=n_row // 5), start=1):
    print(f"--- {ith_df = }", sub_df, sep="\n")
--- ith_df = 1
shape: (10, 3)
┌─────┬─────────────────────────┬──────────────────────┐
│ id ┆ name ┆ phone │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str │
╞═════╪═════════════════════════╪══════════════════════╡
│ 1 ┆ Maria Powell ┆ 836.602.8286x06763 │
│ 2 ┆ Jacqueline Fletcher PhD ┆ +1-801-639-6835 │
│ 3 ┆ Mary Wilson ┆ (999)905-1935 │
│ 4 ┆ Kenneth Davis ┆ 332-275-7129x0289 │
│ 5 ┆ Christopher Martin ┆ (792)727-8878x82819 │
│ 6 ┆ Patricia Brown ┆ 363-719-4855x7138 │
│ 7 ┆ Richard Hodges ┆ (831)255-4407x13496 │
│ 8 ┆ Steve Green ┆ 671-700-3796x127 │
│ 9 ┆ Luis Smith ┆ 844.513.7054x4915 │
│ 10 ┆ Morgan Hensley ┆ 001-741-524-3690x958 │
└─────┴─────────────────────────┴──────────────────────┘
--- ith_df = 2
shape: (10, 3)
┌─────┬────────────────────┬────────────────────────┐
│ id ┆ name ┆ phone │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str │
╞═════╪════════════════════╪════════════════════════╡
│ 11 ┆ Tanya Peck ┆ 819-449-4406 │
│ 12 ┆ Mr. Joseph Parrish ┆ 409-954-9495x85424 │
│ 13 ┆ Juan Frazier ┆ 282.948.5514x337 │
│ 14 ┆ David Murray ┆ 902-780-8331x8359 │
│ 15 ┆ Ryan Campbell ┆ 7713098760 │
│ 16 ┆ Debra Harris ┆ 984.923.3706 │
│ 17 ┆ Andrew Lee ┆ 754-587-2094 │
│ 18 ┆ James Miller ┆ 369.515.1819x923 │
│ 19 ┆ Emma Gentry ┆ 001-323-490-9924x59516 │
│ 20 ┆ Lindsay Meza ┆ 466-888-0910x674 │
└─────┴────────────────────┴────────────────────────┘
--- ith_df = 3
shape: (10, 3)
┌─────┬────────────────────┬──────────────────────┐
│ id ┆ name ┆ phone │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str │
╞═════╪════════════════════╪══════════════════════╡
│ 21 ┆ Daniel Walker MD ┆ +1-832-758-6101x9324 │
│ 22 ┆ Jon Howard ┆ 666-699-5756x1435 │
│ 23 ┆ Shannon Johnson ┆ 8148458256 │
│ 24 ┆ Erica Keller ┆ 864.464.0633 │
│ 25 ┆ Betty Perez ┆ 869.484.7373 │
│ 26 ┆ John Wiley Jr. ┆ 882-603-1099x880 │
│ 27 ┆ Samantha Gutierrez ┆ (688)944-5982x705 │
│ 28 ┆ Matthew Tucker ┆ +1-577-433-9373x084 │
│ 29 ┆ Jennifer Black ┆ (832)819-8567 │
│ 30 ┆ Sarah Pittman ┆ 6854545835 │
└─────┴────────────────────┴──────────────────────┘
--- ith_df = 4
shape: (10, 3)
┌─────┬────────────────────┬──────────────────────┐
│ id ┆ name ┆ phone │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str │
╞═════╪════════════════════╪══════════════════════╡
│ 31 ┆ Dennis Vaughan ┆ 6862893476 │
│ 32 ┆ Kevin Martinez ┆ (419)553-1369 │
│ 33 ┆ Victor Norman ┆ 303-258-9000x83809 │
│ 34 ┆ Daniel Murray ┆ 675-839-2466 │
│ 35 ┆ Patricia Simmons ┆ (816)705-8827x9632 │
│ 36 ┆ Emma Gomez ┆ 653.227.9975 │
│ 37 ┆ Charles Robinson ┆ +1-687-438-5234 │
│ 38 ┆ Anthony Smith ┆ +1-323-237-2932 │
│ 39 ┆ Steven Bennett ┆ 6106982086 │
│ 40 ┆ Francisco Gonzalez ┆ 001-427-821-4543x436 │
└─────┴────────────────────┴──────────────────────┘
--- ith_df = 5
shape: (10, 3)
┌─────┬───────────────────┬──────────────────────┐
│ id ┆ name ┆ phone │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str │
╞═════╪═══════════════════╪══════════════════════╡
│ 41 ┆ Stephanie Shaffer ┆ 001-999-779-0085x959 │
│ 42 ┆ Tina Allen ┆ 669.671.7571 │
│ 43 ┆ Matthew Harrell ┆ 678.831.5948 │
│ 44 ┆ John Hill ┆ 355.476.5055 │
│ 45 ┆ James Shea ┆ +1-903-530-8220x480 │
│ 46 ┆ William Keller ┆ (695)744-1587x0883 │
│ 47 ┆ Vickie Perez ┆ +1-297-963-9194x6132 │
│ 48 ┆ William Howard ┆ 630.391.0772x26252 │
│ 49 ┆ Laura Nash ┆ (495)680-1361x6763 │
│ 50 ┆ Donald Johnson ┆ (282)973-8109x024 │
└─────┴───────────────────┴──────────────────────┘
[24]:
# Case 2: the total number of rows is NOT a multiple of ``n_rows``.
# That is fine: the last sub-DataFrame simply holds the leftover rows.
for ith_df, sub_df in enumerate(df.iter_slices(n_rows=13), start=1):
    print(f"--- {ith_df = }", sub_df, sep="\n")
--- ith_df = 1
shape: (13, 3)
┌─────┬─────────────────────────┬──────────────────────┐
│ id ┆ name ┆ phone │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str │
╞═════╪═════════════════════════╪══════════════════════╡
│ 1 ┆ Maria Powell ┆ 836.602.8286x06763 │
│ 2 ┆ Jacqueline Fletcher PhD ┆ +1-801-639-6835 │
│ 3 ┆ Mary Wilson ┆ (999)905-1935 │
│ 4 ┆ Kenneth Davis ┆ 332-275-7129x0289 │
│ 5 ┆ Christopher Martin ┆ (792)727-8878x82819 │
│ … ┆ … ┆ … │
│ 9 ┆ Luis Smith ┆ 844.513.7054x4915 │
│ 10 ┆ Morgan Hensley ┆ 001-741-524-3690x958 │
│ 11 ┆ Tanya Peck ┆ 819-449-4406 │
│ 12 ┆ Mr. Joseph Parrish ┆ 409-954-9495x85424 │
│ 13 ┆ Juan Frazier ┆ 282.948.5514x337 │
└─────┴─────────────────────────┴──────────────────────┘
--- ith_df = 2
shape: (13, 3)
┌─────┬─────────────────┬───────────────────┐
│ id ┆ name ┆ phone │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str │
╞═════╪═════════════════╪═══════════════════╡
│ 14 ┆ David Murray ┆ 902-780-8331x8359 │
│ 15 ┆ Ryan Campbell ┆ 7713098760 │
│ 16 ┆ Debra Harris ┆ 984.923.3706 │
│ 17 ┆ Andrew Lee ┆ 754-587-2094 │
│ 18 ┆ James Miller ┆ 369.515.1819x923 │
│ … ┆ … ┆ … │
│ 22 ┆ Jon Howard ┆ 666-699-5756x1435 │
│ 23 ┆ Shannon Johnson ┆ 8148458256 │
│ 24 ┆ Erica Keller ┆ 864.464.0633 │
│ 25 ┆ Betty Perez ┆ 869.484.7373 │
│ 26 ┆ John Wiley Jr. ┆ 882-603-1099x880 │
└─────┴─────────────────┴───────────────────┘
--- ith_df = 3
shape: (13, 3)
┌─────┬────────────────────┬─────────────────────┐
│ id ┆ name ┆ phone │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str │
╞═════╪════════════════════╪═════════════════════╡
│ 27 ┆ Samantha Gutierrez ┆ (688)944-5982x705 │
│ 28 ┆ Matthew Tucker ┆ +1-577-433-9373x084 │
│ 29 ┆ Jennifer Black ┆ (832)819-8567 │
│ 30 ┆ Sarah Pittman ┆ 6854545835 │
│ 31 ┆ Dennis Vaughan ┆ 6862893476 │
│ … ┆ … ┆ … │
│ 35 ┆ Patricia Simmons ┆ (816)705-8827x9632 │
│ 36 ┆ Emma Gomez ┆ 653.227.9975 │
│ 37 ┆ Charles Robinson ┆ +1-687-438-5234 │
│ 38 ┆ Anthony Smith ┆ +1-323-237-2932 │
│ 39 ┆ Steven Bennett ┆ 6106982086 │
└─────┴────────────────────┴─────────────────────┘
--- ith_df = 4
shape: (11, 3)
┌─────┬────────────────────┬──────────────────────┐
│ id ┆ name ┆ phone │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str │
╞═════╪════════════════════╪══════════════════════╡
│ 40 ┆ Francisco Gonzalez ┆ 001-427-821-4543x436 │
│ 41 ┆ Stephanie Shaffer ┆ 001-999-779-0085x959 │
│ 42 ┆ Tina Allen ┆ 669.671.7571 │
│ 43 ┆ Matthew Harrell ┆ 678.831.5948 │
│ 44 ┆ John Hill ┆ 355.476.5055 │
│ … ┆ … ┆ … │
│ 46 ┆ William Keller ┆ (695)744-1587x0883 │
│ 47 ┆ Vickie Perez ┆ +1-297-963-9194x6132 │
│ 48 ┆ William Howard ┆ 630.391.0772x26252 │
│ 49 ┆ Laura Nash ┆ (495)680-1361x6763 │
│ 50 ┆ Donald Johnson ┆ (282)973-8109x024 │
└─────┴────────────────────┴──────────────────────┘
Concatenate#
Concatenate Vertically (More rows, same columns)#
DataFrame.extend will edit the first DataFrame in-place!
[25]:
# extend appends the rows of df2 onto df1 itself; the displayed result has all six rows.
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"id": [4, 5, 6]})
df1.extend(df2)
[25]:
shape: (6, 1)
| id |
|---|
| i64 |
| 1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
[26]:
# df1 now has six rows: extend modified it in place.
df1
[26]:
shape: (6, 1)
| id |
|---|
| i64 |
| 1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
DataFrame.vstack DOES NOT edit the first DataFrame in-place!
[27]:
# vstack returns a new DataFrame with df2's rows stacked below df1's.
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"id": [4, 5, 6]})
df1.vstack(df2)
[27]:
shape: (6, 1)
| id |
|---|
| i64 |
| 1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
[28]:
# df1 still has its original three rows: vstack did not modify it.
df1
[28]:
shape: (3, 1)
| id |
|---|
| i64 |
| 1 |
| 2 |
| 3 |
[30]:
# It won't work because the columns are different: df2 has an extra
# "name" column that df1 lacks.
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"id": [4, 5, 6], "name": ["d", "e", "f"]})
try:
    df1.vstack(df2)
except pl.exceptions.ShapeError as e:
    # Catch only the expected error so an unrelated failure still surfaces.
    print(repr(e))
ShapeError('unable to append to a DataFrame of width 1 with a DataFrame of width 2')
Concatenate Horizontally (More columns, same rows)#
DataFrame.hstack DOES NOT edit the first DataFrame in-place!
[31]:
# hstack returns a new DataFrame with df2's columns placed next to df1's.
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"name": ["a", "b", "c"]})
df1.hstack(df2)
[31]:
shape: (3, 2)
| id | name |
|---|---|
| i64 | str |
| 1 | "a" |
| 2 | "b" |
| 3 | "c" |
[32]:
# df1 still has only its original `id` column: hstack did not modify it.
df1
[32]:
shape: (3, 1)
| id |
|---|
| i64 |
| 1 |
| 2 |
| 3 |
[33]:
# It won't work because the row counts are different: df1 has 3 rows
# while df2 has 5.
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"name": ["a", "b", "c", "d", "e"]})
try:
    df1.hstack(df2)
except pl.exceptions.ShapeError as e:
    # Catch only the expected error so an unrelated failure still surfaces.
    print(repr(e))
ShapeError('could not create a new DataFrame: series "id" has length 3 while series "name" has length 5')
[ ]: