Scenario:
Suppose, we have a dataset(ds_Sample) as below with duplicate values in a Prod_ID column and some duplicate rows:
Suppose, we have a dataset(ds_Sample) as below with duplicate values in a Prod_ID column and some duplicate rows:
let
Source = #table (
{"Trans_ID", "Prod_ID", "Prod_Name"},
{
{12345, 123, "Laptop"},
{12346, 124, "Radio"},
{12347, 125, "Keyboard"},
{12348, 126, "Television"},
{12349, 127, "Printer"},
{12350, 128, "Scanner"},
{12351, 129, "Camera"},
{12352, 130, "Tripod"},
{12345, 123, "Laptop"}, // duplicate row
{12348, 126, "Television"}, // duplicate row
{12351, 129, "Camera"}, // duplicate row
{12356, 127, "Printer"},
{12357, 124, "Radio"},
{12358, 126, "Television"}
}
),
ChangeType = Table.TransformColumnTypes(Source, {
{"Trans_ID", Int64.Type},
{"Prod_ID", Int64.Type},
{"Prod_Name", type text}
})
in
ChangeType
Now can identify and flag the duplicate values and rows using below methods.
Method-1: Using Text.Combine method with Composite Key based on specific Columns:
let
/* Source dataset */
Source = ds_Sample,
/* Columns to include in Composite Key */
SelectedCols = {"Trans_ID","Prod_ID"},
/* Composite Key from selected columns */
Def_CompositeKey = Table.AddColumn(Source, "Composite_Key",
each Text.Combine(
List.Transform(
Record.ToList(Record.SelectFields(_, SelectedCols)),
each Text.From(_)
),
"|"
)
),
/* Flag duplicate rows using Composite Key */
Flag_DupRow = Table.AddColumn(Def_CompositeKey, "IsDuplicate_Row",
each if List.Count(List.Select(Def_CompositeKey[Composite_Key], (x) => x = [Composite_Key])) > 1
then "Yes" else "No"
),
/* Flag duplicate products by Prod_ID */
Flag_DupProd = Table.AddColumn(Flag_DupRow, "IsDuplicate_Prod",
each if List.Count(List.Select(Source[Prod_ID], (x) => x = [Prod_ID])) > 1
then "Yes" else "No"
)
in
Flag_DupProd
Method-2: Using Text.ToBinary method and Composite Key (Binary Encoded) based on specific Columns:
let
// Source dataset
Source = ds_Sample,
// Columns to include in Composite_Hash_Key
SelectedCols = {"Trans_ID","Prod_ID"},
// Generate Composite_Hash_Key from selected columns only
Def_CompositeKey = Table.AddColumn(Source, "Composite_Hash_Key",
each Binary.ToText(
Text.ToBinary(
Text.Combine(
List.Transform(
Record.ToList(Record.SelectFields(_, SelectedCols)),
each Text.From(_)
),
"|"
)
),
BinaryEncoding.Base64
)
),
// Flag duplicate rows using Composite_Hash_Key
Flag_DupRow = Table.AddColumn(Def_CompositeKey, "IsDuplicate_Row",
each if List.Count(List.Select(Def_CompositeKey[Composite_Hash_Key], (x) => x = [Composite_Hash_Key])) > 1
then "Yes" else "No"
),
// Flag duplicate products by ProdID
Flag_DupProd = Table.AddColumn(Flag_DupRow, "IsDuplicate_Prod",
each if List.Count(List.Select(Source[Prod_ID], (x) => x = [Prod_ID])) > 1
then "Yes" else "No"
)
in
Flag_DupProd
------------------------------------------------
Method-3: Using Table.Group method to identify the Duplicate values based on Prod_ID:
let
Source = ds_Sample,
// Count how many times each ProdID appears
GroupByProd = Table.Group(Source, {"Prod_ID"}, {{"Prod_Count", each Table.RowCount(_), Int64.Type}}),
// Join back to original details table
MergeToDetails = Table.NestedJoin(Source, {"Prod_ID"}, GroupByProd, {"Prod_ID"}, "ProdCount"),
ExpandTable = Table.ExpandTableColumn(MergeToDetails, "ProdCount", {"Prod_Count"}),
// Add a column to flag duplicate values
FlagDup_Prod = Table.AddColumn(ExpandTable, "IsDuplicate_Prod", each if [Prod_Count] > 1 then "Yes" else "No")
in
FlagDup_Prod
Result:
------------------------------------------------
Method-4: Using Table.Group method to return the no. of Duplicate rows and products:
let
Source = ds_Sample,
// 1. Count duplicate rows
DistinctRows = Table.Distinct(Source),
DuplicateRowCount = Table.RowCount(Source) - Table.RowCount(DistinctRows),
// 2. Count duplicate products (ProdID)
GroupByProd = Table.Group(Source, {"Prod_ID"}, {{"Count", each Table.RowCount(_), Int64.Type}}),
DuplicateProd = Table.RowCount(Table.SelectRows(GroupByProd, each [Count] > 1)),
// 3. Return summary of results
Summary= #table(
{"Metric", "Value"},
{
{"Duplicate Rows", DuplicateRowCount},
{"Duplicate Products", DuplicateProd}
}
)
in
Summary
--------------------------------------------------------------------------------------------------------
Thanks
--------------------------------------------------------------------------------------------------------