% stanzaSeg.m
% Takes in an image of a page and returns an array of the Y-values that
% separate the stanzas on the page.
%
% arguments: imagename - string of filename (anything imread accepts)
% returns:   coorArray - row vector of Y-values (in original-image
%                        coordinates) separating stanzas; the first entry
%                        is the detected top of the page
%
% Requires the Image Processing Toolbox (graythresh, im2bw).
function coorArray = stanzaSeg(imagename)

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % Stage 1: crop the original image so that only the page is left

    % read image to be segmented and get its dimensions
    pic = imread(imagename);
    m = size(pic, 1);   % rows    (height)
    n = size(pic, 2);   % columns (width)

    % binarize the image; the threshold is biased upward so that only
    % bright paper counts as white. im2bw requires a level in [0,1], so
    % the biased level is clamped.
    level = graythresh(pic);
    bw = im2bw(pic, min(level + 0.2, 1));

    % fraction of white pixels per column / per row. The dimension is
    % given explicitly so that single-row or single-column images are not
    % collapsed to a scalar.
    colWhite = mean(bw, 1);
    rowWhite = mean(bw, 2);

    % find the coordinates for the left, right, top and bottom sides of the
    % page by looking for the first column/row (sampled every 10 pixels)
    % that is mostly white AND whose look-ahead neighbour is mostly white.
    % If no coordinate is found, the edge of the image is used.
    % The scan ranges are clamped so the look-ahead index always stays
    % inside the image, even for images smaller than the scan window.
    left   = findPageEdge(colWhite, 1:10:min(700, n - 50),    50, 0.5, 0);
    right  = findPageEdge(colWhite, n:-10:max(n - 700, 51),  -50, 0.5, n);
    top    = findPageEdge(rowWhite, 1:10:min(700, m - 50),    50, 0.2, 0);
    bottom = findPageEdge(rowWhite, m:-10:max(m - 500, 21),  -20, 0.5, m);

    % if the scans crossed each other the crop would be empty; fall back to
    % the full extent of the image along that axis
    if right <= left
        left = 0;
        right = n;
    end
    if bottom <= top
        top = 0;
        bottom = m;
    end

    % crop the image to just the page: rows top..bottom, columns
    % left..right (an edge value of 0 means "start at the first pixel")
    cropPic = pic(max(top, 1):bottom, max(left, 1):right, :);

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % Stage 2: locate the bands of white space between stanzas

    % binarize the cropped page and get the fraction of white pixels per
    % row, as well as the maximum of those fractions ("fully white" row)
    m = size(cropPic, 1);
    level = graythresh(cropPic);
    bw = im2bw(cropPic, min(level + 0.1, 1));
    rowWhite = mean(bw, 2);
    white = max(rowWhite);
    whiteCut = white - 0.03;   % a row above this counts as white space

    % y-value coordinates of the stanza breaks (grows if more are found).
    % The first break is the top of the cropped page. "top" is added to
    % every coordinate because they are found in the cropped image but
    % need to work with the original image.
    yValues = zeros(1, 6);
    yValues(1) = top;
    count = 1;                 % number of valid entries in yValues

    % beginning and end of the block of white space currently being tracked
    gapStart = 0;
    gapEnd = 0;

    % whether the scan is currently inside white space. The scan starts in
    % the "inside white space" state (with gapStart = 0), as the page is
    % assumed to begin with its top margin.
    inGap = true;

    % walk down the page in steps of 20 pixels
    for i = 1:20:m
        v = rowWhite(i);

        if ~inGap && v > whiteCut
            % entering white space
            gapStart = i;
            inGap = true;
        elseif inGap && v > whiteCut
            % still in white space: remember how far it extends
            gapEnd = i;
        elseif inGap && v < whiteCut
            % leaving white space
            if gapEnd ~= 0
                % the gap spanned more than one sample: record its
                % midpoint (plus top) as a stanza break
                count = count + 1;
                yValues(count) = (gapStart + gapEnd) / 2 + top;
            end
            % a gap shorter than 20 pixels (gapEnd never set) is ignored;
            % either way, reset the tracking state
            inGap = false;
            gapStart = 0;
            gapEnd = 0;
        end

        % if the line is completely black (bottom of page): record it and
        % stop. This is just in case there is black space at the bottom of
        % the crop.
        if v == 0
            count = count + 1;
            yValues(count) = i + top;
            break;
        end
    end

    % if the last break is more than 200 pixels above the bottom of the
    % page, add the bottom of the page as a final break (used when the
    % bottom of the page is mostly white and nothing was recorded after the
    % final stanza)
    if m + top - yValues(count) > 200
        count = count + 1;
        yValues(count) = m + top;
    end

    % return only the entries that were actually filled in
    coorArray = yValues(1:count);
end

% Returns the first index in "candidates" for which both profile(index) and
% profile(index + lookOffset) exceed "thresh"; returns "fallback" when no
% candidate qualifies (including when "candidates" is empty).
function idx = findPageEdge(profile, candidates, lookOffset, thresh, fallback)
    idx = fallback;
    for c = candidates
        if profile(c) > thresh && profile(c + lookOffset) > thresh
            idx = c;
            return;
        end
    end
end