// loccounter.go

// Copyright 2018 Christos Katsakioris
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package glocc

import (
	"bufio"
	"fmt"
	"os"
	"strings"
	"unicode/utf8"
)

// Shared instances of the states that carry no per-counter data. Every
// LocCounter points at these same two values instead of allocating its own
// copies; only stateMultiLineComment, which stores a token, is created per
// LocCounter.
var (
	globalStateInitial = &stateInitial{}
	globalStateCode    = &stateCode{}
)

// LocCounter is the core entity of the package, which initiates and later
// holds the state of the counting for a single file.
// It is associated with the counting of a single file, and created in the
// goroutine that is assigned to count the file.
type LocCounter struct {
	// language supplies the inline and multi-line comment tokens that the
	// states search for in each line.
	language language
	// loc is the number of lines counted as code so far.
	loc int

	// file is the file whose lines are being counted.
	file *os.File
	// currLine is the not-yet-processed remainder of the line being examined,
	// with leading spaces and tabs removed. States shorten it as they consume
	// comment tokens.
	currLine string
	// currLineCounted is set by a state when the line being examined should
	// be added to loc.
	currLineCounted bool
	// fileLinesCnt is the number of lines read from file so far; it is used
	// as the line number in debug log messages.
	fileLinesCnt int

	// state is the state that will process the current line next.
	state loccState
	// stateMultiLineComment is this counter's own multi-line comment state,
	// which remembers the token that opened the comment currently being
	// skipped.
	stateMultiLineComment *stateMultiLineComment
}

// NewLocCounter builds a LocCounter for file, selecting the language to count
// by looking up ext in the table of supported languages.
//
// The returned counter starts in the initial state and owns a fresh
// multi-line comment state. If ext does not map to a supported language, a
// nil counter and a descriptive error are returned.
func NewLocCounter(file *os.File, ext string) (lc *LocCounter, err error) {
	lang, known := languages[ext]
	if !known {
		return nil, fmt.Errorf("Cannot deduce a supported language from extension %q.", ext)
	}
	return &LocCounter{
		language:              lang,
		file:                  file,
		state:                 globalStateInitial,
		stateMultiLineComment: &stateMultiLineComment{},
	}, nil
}

// Count is the only exported method of LocCounter. It reads the file
// associated with the LocCounter line by line and returns the number of lines
// counted as code. It is implemented using the State design pattern: every
// line is handed to the current state, repeatedly, until a state reports that
// the line has been fully processed.
//
// Leading spaces and tabs are stripped from each line before processing.
// If reading the file fails, the number of lines counted up to that point is
// returned together with the read error.
func (lc *LocCounter) Count() (int, error) {
	const (
		// Starting capacity of the scanner's line buffer.
		initialLineBuf = 4 * 1024
		// Longest line accepted. bufio.Scanner's default limit is 64 KiB
		// (bufio.MaxScanTokenSize), which makes scanning stop with
		// bufio.ErrTooLong on minified or generated sources; the buffer only
		// grows towards this limit when a file actually needs it.
		maxLineLen = 16 * 1024 * 1024
	)

	logger.Printf("DEBUG LocCounter.Count() for file %q: Starting...\n", lc.file.Name())
	fsc := bufio.NewScanner(lc.file)
	fsc.Buffer(make([]byte, 0, initialLineBuf), maxLineLen)
	for fsc.Scan() {
		lc.fileLinesCnt++
		lc.currLine = strings.TrimLeft(fsc.Text(), " \t") // trim leading whitespace
		lc.currLineCounted = false
		// A state returns false when it changed lc.state and the new state
		// must look at the remainder of the same line.
		for !lc.state.process(lc) {
		}
		if lc.currLineCounted {
			logger.Printf("DEBUG %q:%d --> Counted\n", lc.file.Name(), lc.fileLinesCnt)
			lc.loc++
		} else {
			logger.Printf("DEBUG %q:%d --> Discarded\n", lc.file.Name(), lc.fileLinesCnt)
		}
	}
	if err := fsc.Err(); err != nil {
		logger.Println("ERROR", err)
		return lc.loc, err
	}

	logger.Printf("DEBUG LocCounter.Count() for file %q: Finished.\n", lc.file.Name())
	return lc.loc, nil
}

// setState makes state the one that will process the current line next.
func (lc *LocCounter) setState(state loccState) {
	lc.state = state
}

// lineIsEmpty reports whether nothing is left of the current line.
func (lc *LocCounter) lineIsEmpty() bool {
	return len(lc.currLine) == 0
}

// inlineCommentIndex returns the byte offset, within the current line, of the
// earliest occurrence of any of the language's inline comment tokens. When
// the line contains none of them, the length of the line is returned.
// A debug message is logged whenever a token is found.
func (lc *LocCounter) inlineCommentIndex() int {
	lineLen := len(lc.currLine)
	earliest := lineLen
	for _, token := range lc.language.inlineCommentTokens {
		if pos := strings.Index(lc.currLine, token); pos >= 0 && pos < earliest {
			earliest = pos
		}
	}
	if earliest != lineLen {
		logger.Printf("DEBUG Inline comment token found at %q:%d\n", lc.file.Name(), lc.fileLinesCnt)
	}
	return earliest
}

// loccState is the current state of a LocCounter. The state may change zero
// or more times while a single line is being processed.
// Part of the State design pattern implementation.
type loccState interface {
	// process handles the LocCounter's current line (or what is left of it).
	// It returns true once the line needs no further processing, which ends
	// the loop in LocCounter.Count that keeps calling process on the
	// counter's current state. It returns false when the line must be
	// processed again by whatever state the counter is in after the call.
	process(*LocCounter) bool
}

// stateInitial is the state every LocCounter starts in. It carries no data,
// so a single shared instance (globalStateInitial) is used by all counters.
type stateInitial struct{}

// process is the line processing method of stateInitial.
//
// Empty lines and lines that begin with an inline comment token are skipped
// without leaving the initial state. The first line that is neither moves the
// counter either into the multi-line comment state (when a multi-line comment
// starting token occurs before any inline comment token) or into the code
// state, and the same line is then processed again by that state.
func (s *stateInitial) process(lc *LocCounter) bool {
	inlineIdx := lc.inlineCommentIndex()
	if lc.lineIsEmpty() || inlineIdx == 0 {
		return true
	}

	// Locate the earliest multi-line comment opener on the line, if any.
	blockIdx, blockToken := len(lc.currLine), ""
	for _, candidate := range lc.language.multiLineCommentStartingTokens {
		pos := strings.Index(lc.currLine, candidate)
		if pos < 0 || pos >= blockIdx {
			continue
		}
		blockIdx, blockToken = pos, candidate
	}

	// No opener ahead of the first inline comment token: this is code.
	if blockIdx >= inlineIdx {
		lc.setState(globalStateCode)
		return false
	}

	logger.Printf("DEBUG Multi-line comment starting at %q:%d\n", lc.file.Name(), lc.fileLinesCnt)
	// Anything in front of the opener makes the line count.
	if blockIdx > 0 {
		lc.currLineCounted = true
	}
	// Hand the text after the opener to the multi-line comment state, since
	// the comment may well end on this very line.
	afterOpener := lc.currLine[blockIdx+len(blockToken):]
	lc.currLine = strings.TrimLeft(afterOpener, " \t")
	lc.stateMultiLineComment.setToken(blockToken)
	lc.setState(lc.stateMultiLineComment)
	return false
}

// stateMultiLineComment is the state of a LocCounter that is currently inside
// a multi-line comment.
type stateMultiLineComment struct {
	// token is the starting token that opened the comment being skipped.
	// Remembering it is needed for Python (and any similar language), where
	// `'''` may occur inside a `"""` multi-line comment and `"""` inside a
	// `'''` one, so the comment must be closed by the token matching the one
	// that opened it.
	token string
}

// process is the line processing method of stateMultiLineComment.
//
// It looks for the token that closes the current multi-line comment. When
// one is found, the text after it becomes the current line, the counter moves
// to the code state and false is returned so that the remainder is processed
// there. Otherwise the whole line belongs to the comment and true is
// returned.
func (s *stateMultiLineComment) process(lc *LocCounter) bool {
	// All supported languages close a block comment with the opening token
	// reversed, except (for now) Ruby, and Java and PHP docstrings. So when
	// the reversed opener is a valid ending token of the language, it is the
	// only one accepted; otherwise any ending token of the language will do.
	mirrored := reversed(lc.stateMultiLineComment.token)
	closers := lc.language.multiLineCommentEndingTokens
	for _, t := range closers {
		if t == mirrored {
			closers = []string{mirrored}
			break
		}
	}

	// Locate the earliest closer on the line, if any.
	endIdx, endToken := len(lc.currLine), ""
	for _, t := range closers {
		if pos := strings.Index(lc.currLine, t); pos >= 0 && pos < endIdx {
			endIdx, endToken = pos, t
		}
	}

	// No closer: the comment continues on the next line.
	if endIdx == len(lc.currLine) {
		return true
	}

	logger.Printf("DEBUG Multi-line comment ending at %q:%d\n", lc.file.Name(), lc.fileLinesCnt)
	s.token = ""
	lc.currLine = strings.TrimLeft(lc.currLine[endIdx+len(endToken):], " \t")
	lc.setState(globalStateCode)
	return false
}

// setToken records token as the starting token of the multi-line comment
// that is being entered.
func (s *stateMultiLineComment) setToken(token string) {
	s.token = token
}

// stateCode is the state of a LocCounter that is currently processing code,
// i.e. lines that are candidates for being counted. It carries no data, so a
// single shared instance (globalStateCode) is used by all counters.
type stateCode struct{}

// process is the line processing method of stateCode.
//
// Empty lines and lines that begin with an inline comment token are left
// uncounted. If a multi-line comment starting token occurs before any inline
// comment token, the counter moves to the multi-line comment state and false
// is returned so that the rest of the line is processed there; the line is
// counted only if something preceded the token. Any other line is counted.
func (s *stateCode) process(lc *LocCounter) bool {
	inlineIdx := lc.inlineCommentIndex()
	if lc.lineIsEmpty() || inlineIdx == 0 {
		return true
	}

	// Locate the earliest multi-line comment opener on the line, if any.
	blockIdx, blockToken := len(lc.currLine), ""
	for _, candidate := range lc.language.multiLineCommentStartingTokens {
		pos := strings.Index(lc.currLine, candidate)
		if pos < 0 || pos >= blockIdx {
			continue
		}
		blockIdx, blockToken = pos, candidate
	}

	// No opener ahead of the first inline comment token: plain code.
	if blockIdx >= inlineIdx {
		lc.currLineCounted = true
		return true
	}

	logger.Printf("DEBUG Multi-line comment start found at %q:%d\n", lc.file.Name(), lc.fileLinesCnt)
	// Anything in front of the opener makes the line count.
	if blockIdx > 0 {
		lc.currLineCounted = true
	}
	// Hand the text after the opener to the multi-line comment state, since
	// the comment may well end on this very line.
	afterOpener := lc.currLine[blockIdx+len(blockToken):]
	lc.currLine = strings.TrimLeft(afterOpener, " \t")
	lc.stateMultiLineComment.setToken(blockToken)
	lc.setState(lc.stateMultiLineComment)
	return false
}

// reversed returns s with its runes in reverse order. The bytes of each rune
// keep their original order, so valid UTF-8 input yields valid UTF-8 output.
//
// A byte that is not part of a valid UTF-8 sequence is treated as a unit of
// its own and copied verbatim; the result therefore always has the same
// length as s.
func reversed(s string) string {
	size := len(s)
	buf := make([]byte, size)
	for i := 0; i < size; {
		// Only the width is needed. Copying the source bytes (rather than
		// re-encoding the decoded rune) keeps invalid bytes one byte wide:
		// re-encoding utf8.RuneError would need three bytes and overrun the
		// single-byte slot reserved for it.
		_, n := utf8.DecodeRuneInString(s[i:])
		copy(buf[size-i-n:], s[i:i+n])
		i += n
	}
	return string(buf)
}