Find Median in a Stream

Find median in a stream

Problem Statement

Given are some integers, which are read from the data stream. The task is to find the median of the integers read so far.

The median is the middle value of a sorted list of integers. If the size of the list is even, the median is the average of the two middle elements.

Examples:
Input: [1, 2, 3,…]
Output: [1, 1.5, 2..]
Explanation:

  • The list contains [1]. So, median = 1 / 1 = 1
  • The list contains [1, 2]. Median = (1 + 2) / 2 = 1.5
  • The list contains [1, 2, 3]. Median = (1 + 2 + 3) / 3 = 2

Approach 1: Sorting

The most basic approach is to store the integers in a list and sort the list every time for calculating the median.

Algorithm:

  • Initialize a list for storing the integers.
  • Sort the list every time.
  • Print the median.

C++ Code

class MedianFinder {
  vector < int > store;
 
  public:
    void addNum(int num) {
      store.push_back(num);
    }
  double findMedian() {
    sort(store.begin(), store.end());
 
    int n = store.size();
    return (n & 1 ? store[n / 2] : ((double) store[n / 2 - 1] + store[n / 2]) * 0.5);
  }
};

Java Code

 class MedianFinder {
  List < Integer > input = new ArrayList < > ();
 
  public void addNum(int num) {
    input.add(num);
  }
 
  public double findMedian() {
    Collections.sort(input);
    int len = input.size();
    if (len % 2 == 1) return input.get(len / 2);
    return 0.5 * (input.get(len / 2) + input.get(len / 2 - 1));
  }
}

Python Code

class MedianFinder:
    def __init__(self):
        self.array = []
 
    def addNum(self, num: int) -> None:
        self.array.append(num)
 
    def findMedian(self) -> float:
        self.array.sort()
        n = len(self.array)
        if not self.array or n == 1:
            return 0 or self.array[0]
        elif n & 1:
            return self.array[n // 2]
        else:
            return (self.array[n // 2] + self.array[(n // 2) - 1]) / 2

Time Complexity:O(NlogN), where N is a number of elements.
Space Complexity:O(N), for storing list.

Approach 2: Insertion Sort

In the previous approach, we sorted the list every time. Instead of sorting, we can insert the item in their specific position to keep the list always sorted.

Algorithm:

  • We assume the list is already sorted.
  • When a new integer comes up, apply binary search to insert the integer in its correct position.
  • Now, the list is sorted and you can find the median.

C++ Implementation

class MedianFinder {
  vector < int > store;
 
  public:
    void addNum(int num) {
      if (store.empty())
        store.push_back(num);
      else
        store.insert(lower_bound(store.begin(), store.end(), num), num);
    }
  double findMedian() {
    int n = store.size();
    return n & 1 ? store[n / 2] : ((double) store[n / 2 - 1] + store[n / 2]) * 0.5;
  }
};

Java Implementation

 class MedianFinder {
  List < Integer > data;
  public MedianFinder() {
    data = new ArrayList();
  }
 
  public void addNum(int num) {
    int idx = Collections.binarySearch(data, num);
    if (idx >= 0) {
      data.add(idx, num);
    } else {
      data.add(-idx - 1, num);
    }
  }
 
  public double findMedian() {
    int len = data.size();
    int mid = data.get(len / 2);
    if (len % 2 == 1) {
      return mid;
    } else {
      return 1.0 * ((data.get(len / 2 - 1)) + mid) / 2;
    }
  }
}

Python Implementation

class MedianFinder:
    def __init__(self):
        self.q = []
 
    def addNum(self, num):
        bisect.insort(self.q, num)
 
    def findMedian(self):
        n = len(self.q)
        m = n >> 1
        return self.q[m] if n & 1 else (self.q[m - 1] + self.q[m]) / 2

Time Complexity: O(N), where N is a number of elements.
Space Complexity: O(N), for storing list.

Approach 3: Two Heaps

The idea is to use a max heap and a min-heap. Max heap is used to store the smaller half of the number and the min-heap is used to store the larger half of the numbers.

Algorithm

  • If the current element to be added is less than the maximum element of the max heap, then add this to the max heap.
  • If the difference between the size of the max and min heap becomes greater than 1, the top element of the max heap is removed and added to the min-heap.
  • If the current element to be added is greater than the maximum element of the min-heap, then add this to the min-heap.
  • If the difference between the size of the max and min heap becomes greater than 1, the top element of the min-heap is removed and added to the max heap.

C++ Code For Two Heaps

void findMedian(int arr[], int n) {
  priority_queue < int > lowerHalf;
  priority_queue < int, vector < int > , greater < int >> higherHalf;
 
  int median;
  for (int size = 1; size <= n; size++) {
    if (!lowerHalf.empty() && lowerHalf.top() > arr[size - 1]) {
      lowerHalf.push(arr[size - 1]);
 
      if (lowerHalf.size() > higherHalf.size() + 1) {
        higherHalf.push(lowerHalf.top());
        lowerHalf.pop();
      }
    } else {
      higherHalf.push(arr[size - 1]);
 
      if (higherHalf.size() > lowerHalf.size() + 1) {
        lowerHalf.push(higherHalf.top());
        higherHalf.pop();
      }
    }
    if (size % 2 == 1) {
      if (higherHalf.size() > lowerHalf.size()) {
        median = higherHalf.top();
      } else {
        median = lowerHalf.top();
      }
    } else {
      median = (lowerHalf.top() + higherHalf.top()) / 2;
    }
    cout << median << " ";
  }
}

Java Code For Two Heaps

public class Solution {
 
  public static void findMedian(int arr[]) {
    int n = arr.length;
 
    PriorityQueue < Integer > lowerHalf = new PriorityQueue < > (new Comparator < Integer > () {
      @Override
      public int compare(Integer first, Integer second) {
        return (second - first);
      }
    });
 
    PriorityQueue < Integer > higherHalf = new PriorityQueue < > ();
 
    int median;
    for (int size = 1; size <= n; size++) {
      if (!lowerHalf.isEmpty() && lowerHalf.peek() > arr[size - 1]) {
        lowerHalf.add(arr[size - 1]);
 
        if (lowerHalf.size() > higherHalf.size() + 1) {
          higherHalf.add(lowerHalf.poll());
        }
      } else {
        higherHalf.add(arr[size - 1]);
 
        if (higherHalf.size() > lowerHalf.size() + 1) {
          lowerHalf.add(higherHalf.poll());
        }
      }
      if (size % 2 == 1) {
        if (higherHalf.size() > lowerHalf.size()) {
          median = higherHalf.peek();
        } else {
          median = lowerHalf.peek();
        }
      } else {
        median = (lowerHalf.peek() + higherHalf.peek()) / 2;
      }
      System.out.print(median + " ");
    }
  }
 
}

Python Code For Two Heaps

from queue import PriorityQueue
 
INT_MAX = 10000000000
 
 
def findMedian(arr, n):
    lowerHalf = PriorityQueue()
    higherHalf = PriorityQueue()
    for size in range(1, n + 1):
 
        x = INT_MAX
        if lowerHalf.qsize() > 0:
 
            x = abs(lowerHalf.get())
            lowerHalf.put(-x)
 
        if lowerHalf.qsize() > 0 and x > arr[size - 1]:
            lowerHalf.put(-(arr[size - 1]))
 
            if lowerHalf.qsize() > higherHalf.qsize() + 1:
                higherHalf.put(abs(lowerHalf.get()))
 
        else:
 
            higherHalf.put(arr[size - 1])
 
            if higherHalf.qsize() > lowerHalf.qsize() + 1:
                lowerHalf.put(-(higherHalf.get()))
 
        if size % 2 == 1:
 
            if higherHalf.qsize() > lowerHalf.qsize():
 
                median = higherHalf.get()
                higherHalf.put(median)
 
            else:
 
                median = abs(lowerHalf.get())
                lowerHalf.put(-median)
 
        else:
 
            a = lowerHalf.get()
            lowerHalf.put(a)
            b = higherHalf.get()
            higherHalf.put(b)
 
            median = (abs(a) + b) // 2
 
        print(median, end=" ")

Time Complexity: O(NlogN), where N is the number of elements.
Space Complexity: O(N), for storing lists.

FAQs

Q. What is a median?
A. The median is the middle value of a sorted list of integers. If the size of the list is even, the median is the average of the two middle elements.

Q. What is the most efficient approach to solving this problem?
A. The approach using a min-heap and max heap is the most efficient approach to solve the problem. The time complexity is O(logN) and the space complexity is O(N).

Previous Post

Friends Pairing Problem (Solution)

Next Post

Best C Programming Books for Beginners & Expert